Libraries

Start with clearing environment and loading packages

set.seed(9) #set the seed
rm(list=ls()) #remove all current files in the environment

#load all necessary libraries
library(ggthemes)
library(pheatmap)
library(ggplot2)
library(matrixStats)
library(wesanderson)
library(clusterProfiler)
library(enrichplot)
library(msigdbr)
library(dichromat)
library(stringr)
library(dplyr)
library(ggrepel)
library(reshape2)
library(umap)
library(ggthemes)
library(cowplot)
library(DEP)
## Warning in fun(libname, pkgname): mzR has been built against a different Rcpp version (1.0.10)
## than is installed on your system (1.0.11). This might lead to errors
## when loading mzR. If you encounter such issues, please send a report,
## including the output of sessionInfo() to the Bioc support forum at 
## https://support.bioconductor.org/. For details see also
## https://github.com/sneumann/mzR/wiki/mzR-Rcpp-compiler-linker-issue.
library(naniar)
library(SummarizedExperiment)
library(data.table)
library(org.Hs.eg.db)
library(RColorBrewer)
library(readr)
library(readxl)
library(viridis)

Set working directories

# if you are using Rstudio run the following command, otherwise, set the working directory to the folder where this script is in
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

# create directory for results
dir.create(file.path(getwd(),'results'), showWarnings = FALSE)
# create directory for plots
dir.create(file.path(getwd(),'plots'), showWarnings = FALSE)
# create directory for plots for the paper
dir.create(file.path(getwd(),'plots/paper'), showWarnings = FALSE)

Define colour palettes

#all colourblind friendly options
 display.brewer.all(n=NULL, type="all", select=NULL, exact.n=TRUE, 
                    colorblindFriendly=TRUE)

#create a list with all colours for the final paper figures
#each list element will be a named vector
final_colours = list(
  #red and blue for volcano plot
  volcano_plot = brewer.pal(n=10, "RdYlBu")[c(2,9)],
  male_female = brewer.pal(n=8, "Set2")[c(1,4)],
  #heatmap_scale = rev(brewer.pal(n = 11, "RdYlBu")),
  heatmap_scale = rev(brewer.pal(n=11, "RdBu")),
  clustering = brewer.pal(n=8, "Set2")[c(2,3,5)],
  age_scale = brewer.pal(n = 9, "YlGn"),
  disease_progression_scale = brewer.pal(n = 9, "YlOrRd"),
  onset = c(brewer.pal(n=11, "RdYlBu")[c(4,5)], "#B3B3B3"),
  disease_status = c(brewer.pal(n=11, "RdYlBu")[2],"#B3B3B3"),
  genetic_testing = c("#B3B3B3", brewer.pal(n = 11, "PRGn")[c(3, 9)]),
  center = c("purple4", "orange3"),
  neurofilaments = brewer.pal(n = 9, "PuBu"),
  pNFh_scale = brewer.pal(n = 9, "Purples"),
  age_at_onset_scale = brewer.pal(n = 9, "Blues"),
  slow_vital_capacity_scale = brewer.pal(n = 9, "Reds")
)

#give the vectors with discrete variables names
names(final_colours$volcano_plot) =  c("up", "down")
names(final_colours$male_female) = c("Male", "Female")
names(final_colours$clustering) = c("alpha", "beta", "theta")
names(final_colours$disease_status) = c("als", "ctrl")
names(final_colours$onset) = c("spinal", "bulbar", "ctrl")
names(final_colours$genetic_testing) = c("not_performed", "negative", "C9orf72")
names(final_colours$center) = c("goettingen", "munich")

Generate fake data

#GENERATING FAKE ABUNDANCY DATA
    set.seed(9)
    
    #take the protein names from the actual dataset
    protein_names = readRDS(file = "data/protein_names.rds")
      
    # Number of proteins and patients
    num_proteins <- length(protein_names)
    num_patients <- 100
    
    # Generate fake abundancy data
    abundance_data <- matrix(runif(num_proteins * num_patients, min = 0, max = 1000), nrow = num_proteins, ncol = num_patients)
    
    # Creating a pattern of differential expression
    pattern <- rep(c(rep(1, num_patients/2), rep(2, num_patients/2)), num_proteins/2)
    
    # Apply the pattern to abundance data
    abundance_data <- abundance_data * pattern
## Warning in abundance_data * pattern: longer object length is not a multiple of
## shorter object length
    # Select 5 proteins and make them drastically upregulated or downregulated in the first 50 patients compared to the second 50 patients
      for (i in 1:5) {
        protein_index <- sample(1:num_proteins, 1)  # Select a random protein
        fold_change <- runif(1, 4, 6)  # Adjust the fold change factor as needed
        abundance_data[protein_index, 1:50] <- abundance_data[protein_index, 1:50] * fold_change
      }
    
    # Adding some noise
    abundance_data <- abundance_data + rnorm(num_proteins * num_patients, mean = 0, sd = 0.5)
    
    # Introduce missing values randomly
    missing_percent <- 0.2  # Adjust this to set the percentage of missing values
    
    # Calculate the number of missing values to introduce
    num_missing <- round(num_proteins * num_patients * missing_percent)
    
    # Generate random indices to introduce missing values
    missing_indices <- sample(1:(num_proteins * num_patients), num_missing)
    
    # Replace values at missing indices with NA
    abundance_data[missing_indices] <- NA
    
    # Change negative values to positive
    abundance_data = abs(abundance_data)
    
    #make into dataframe and give normal name
    abu_data = as.data.frame(abundance_data)
    
    #colnames proteins and patients
    rownames(abu_data) = protein_names
    colnames(abu_data) = paste0("patient_", 1:num_patients)
    
    #specifically downregulate the following three proteins because they belong to the same pathway
    p = c("CHI3L2","CHIT1","CHI3L1")
    for(prot in p){
      fold_change = runif(1, min = 0.2, max = 0.3)
      abu_data[prot, 1:50] <- abu_data[prot, 1:50] * fold_change
    }
    
#GENERATING FAKE CLINICAL DATA
    
    clin = data.frame(
      patid = paste0("patient_", 1:num_patients),
      disease = as.factor(c(rep("als", 50), rep("ctrl", 50))),
      sex = as.factor(sample(c("Male", "Female"), num_patients, replace = TRUE)),
      age = runif(num_patients, min = 40, max = 80), 
      neurofilaments = runif(num_patients, min = 0, max = 1000),
      genetics = as.factor(sample(c("negative", "not_performed", "C9orf72"), num_patients, replace = TRUE)), 
      onset = as.factor(sample(c("spinal", "bulbar"), num_patients, replace = TRUE)), 
      age_at_onset = runif(num_patients, min = 40, max = 80), 
      progression_rate = runif(num_patients, min = 0, max = 8), 
      slow_vital_capacity = runif(num_patients, min = 0, max = 10), 
      pNFh = runif(num_patients, min = 0, max = 1000),
      center = as.factor(sample(c("munich", "goettingen"), num_patients, replace = TRUE))
    )
    
    #create extra categorical variable for age based on median
      m = median(clin$age)
      clin$age_cat = rep(NA, length(clin$age))
      clin$age_cat[clin$age>=m] = "over_59"
      clin$age_cat[clin$age<m] = "under_59"
      clin$age_cat = as.factor(clin$age_cat)
      
    #remove onset, pNFh, slow_vital_capacity, age_at_onset for ctrl patients
      clin[51:100,c("onset", "pNFh", "slow_vital_capacity", "age_at_onset")] = NA
      
    #make patient ids to rownames
      rownames(clin) = clin$patid
      
saveRDS(list(abundancy = abu_data,
             clinical_data = clin),
        file = "mock_data.rds")

Make summarized experiment

#make summarized experiments
#this is a complex data format that is required to use the functions from the DEP package
#how to work with this data format can be found online when looking up the package 'SummarizedExperiment'
      
      abu_data$name = abu_data$ID = rownames(abu_data)

      #make summarized experiment with all patients
      abundance.columns <- 1:num_patients # get abundance column numbers
      experimental.design = clin[,c("patid","disease", "onset", "age", "sex", "neurofilaments", "genetics", "age_at_onset", "progression_rate", "age_cat", "center", "slow_vital_capacity", "pNFh")] #selection of the clinical variables to include in the summarized experiment
      colnames(experimental.design) = c("label","condition","onset", "age", "sex", "neurofilaments", "genetics", "age_at_onset", "progression_rate", "age_cat", "center", "slow_vital_capacity", "pNFh") #rename the "patid" and "disease" to label and condition, which is needed to make it work
      experimental.design$replicate = 1:nrow(experimental.design)
      se_abu_data <- make_se(abu_data, abundance.columns, experimental.design) #construct the SE (summarized experiment)
      
      #make separate summarized experiment with only ALS patients
      #and onset as condition variable
      patids_ALS = clin$patid[clin$disease=="als"] #make selection of ALS patients
      abundance.columns <- grep("patient", colnames(abu_data[,c("name","ID",patids_ALS)])) # get abundance column numbers
      experimental.design = clin[patids_ALS, c("patid","onset", "age", "sex", "neurofilaments", "genetics", "age_at_onset", "progression_rate", "age_cat", "center")]
      colnames(experimental.design) = c("label","condition","age", "sex", "neurofilaments", "genetics", "age_at_onset", "progression_rate", "age_cat", "center") #here we take the onset as the condition
      experimental.design$replicate = 1:nrow(experimental.design)
      se_abu_data_ALS <- make_se(abu_data[,c("name","ID",patids_ALS)], abundance.columns, experimental.design)

Missing inspection

      #all patients
      vis_miss(as.data.frame(assay(se_abu_data)) ,show_perc = TRUE, show_perc_col = TRUE, cluster = T) #make a missing heatmap from the summarized experiment

      ggsave("plots/missing_vis_miss_heatmap_before.pdf", width = 11, height = 8, units = "in") #save the missing heatmap
      #filter proteins that are missing more than 20% in at least one condition
      se_abu_data_filtered = filter_missval(se_abu_data, thr = (0.2/2*ncol(assay(se_abu_data)))) #create a filtered summarized experiment, the 0.2 in the formula stands for the 20%
      vis_miss(as.data.frame(assay(se_abu_data_filtered)),show_perc = TRUE, show_perc_col = TRUE, cluster = T)

      ggsave("plots/missing_vis_miss_heatmap_after.pdf", width = 11, height = 8, units = "in")
      
      #only ALS patients
      vis_miss(as.data.frame(assay(se_abu_data_ALS)),show_perc = TRUE, show_perc_col = TRUE, cluster = T)

      ggsave("plots/missing_vis_miss_heatmap_before_ALS.pdf", width = 11, height = 8, units = "in")
      #filter values that are missing more than 20% in at least one condition
      se_abu_data_filtered_ALS = filter_missval(se_abu_data_ALS, thr = (0.2/2*ncol(assay(se_abu_data_ALS))))
      vis_miss(as.data.frame(assay(se_abu_data_filtered_ALS)),show_perc = TRUE, show_perc_col = TRUE, cluster = T)

      ggsave("plots/missing_vis_miss_heatmap_after_ALS.pdf", width = 11, height = 8, units = "in")
      
      #dimensions of the data, where the rows are the number of proteins and the columns the number of patients
      dim(se_abu_data) #all patients before filtering
## [1] 639 100
      dim(se_abu_data_filtered) #all patients after filtering
## [1] 530 100
      dim(se_abu_data_ALS) #only ALS before filtering
## [1] 639  50
      dim(se_abu_data_filtered_ALS) #only ALS after filtering
## [1] 555  50
      # % missing per patient:
      round(apply(X = as.data.frame(assay(se_abu_data)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(se_abu_data))) * 100 , 1)
##    als_1    als_2    als_3    als_4    als_5    als_6    als_7    als_8 
##     18.2     17.1     17.8     20.7     21.6     17.4     19.6     20.5 
##    als_9   als_10   als_11   als_12   als_13   als_14   als_15   als_16 
##     21.0     17.1     19.6     21.0     19.7     19.4     20.7     20.2 
##   als_17   als_18   als_19   als_20   als_21   als_22   als_23   als_24 
##     18.0     20.7     19.7     20.7     20.0     22.1     22.5     22.2 
##   als_25   als_26   als_27   als_28   als_29   als_30   als_31   als_32 
##     20.5     22.2     17.1     18.3     21.4     18.5     20.2     18.9 
##   als_33   als_34   als_35   als_36   als_37   als_38   als_39   als_40 
##     18.9     19.4     19.1     20.0     20.7     17.7     21.6     18.6 
##   als_41   als_42   als_43   als_44   als_45   als_46   als_47   als_48 
##     19.7     18.8     21.3     18.6     19.6     21.6     20.7     21.0 
##   als_49   als_50  ctrl_51  ctrl_52  ctrl_53  ctrl_54  ctrl_55  ctrl_56 
##     19.9     21.6     22.8     19.6     20.0     18.6     20.7     19.9 
##  ctrl_57  ctrl_58  ctrl_59  ctrl_60  ctrl_61  ctrl_62  ctrl_63  ctrl_64 
##     19.6     18.0     18.2     21.1     19.6     23.3     19.7     21.3 
##  ctrl_65  ctrl_66  ctrl_67  ctrl_68  ctrl_69  ctrl_70  ctrl_71  ctrl_72 
##     23.0     18.3     20.7     20.3     20.3     20.8     19.2     20.3 
##  ctrl_73  ctrl_74  ctrl_75  ctrl_76  ctrl_77  ctrl_78  ctrl_79  ctrl_80 
##     19.7     20.0     18.6     22.1     19.9     18.8     22.7     16.7 
##  ctrl_81  ctrl_82  ctrl_83  ctrl_84  ctrl_85  ctrl_86  ctrl_87  ctrl_88 
##     20.3     19.1     20.2     18.9     16.7     21.1     20.0     21.4 
##  ctrl_89  ctrl_90  ctrl_91  ctrl_92  ctrl_93  ctrl_94  ctrl_95  ctrl_96 
##     17.8     21.0     21.6     19.7     22.7     18.8     22.1     20.3 
##  ctrl_97  ctrl_98  ctrl_99 ctrl_100 
##     18.0     19.9     21.0     22.4
      round(apply(X = as.data.frame(assay(se_abu_data_filtered)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(se_abu_data))) * 100 , 1)
##    als_1    als_2    als_3    als_4    als_5    als_6    als_7    als_8 
##     14.6     14.2     13.6     17.1     16.4     13.5     14.9     16.6 
##    als_9   als_10   als_11   als_12   als_13   als_14   als_15   als_16 
##     16.9     13.1     16.3     15.8     15.8     16.0     17.2     16.3 
##   als_17   als_18   als_19   als_20   als_21   als_22   als_23   als_24 
##     13.8     16.0     14.4     16.0     16.4     17.7     17.2     16.3 
##   als_25   als_26   als_27   als_28   als_29   als_30   als_31   als_32 
##     15.6     16.9     13.6     14.2     16.0     14.6     16.3     14.7 
##   als_33   als_34   als_35   als_36   als_37   als_38   als_39   als_40 
##     14.6     14.2     15.8     16.0     15.6     13.3     17.4     14.9 
##   als_41   als_42   als_43   als_44   als_45   als_46   als_47   als_48 
##     15.5     15.3     16.9     14.7     15.8     17.2     16.3     16.0 
##   als_49   als_50  ctrl_51  ctrl_52  ctrl_53  ctrl_54  ctrl_55  ctrl_56 
##     14.6     16.4     18.6     15.8     17.1     14.7     17.2     15.8 
##  ctrl_57  ctrl_58  ctrl_59  ctrl_60  ctrl_61  ctrl_62  ctrl_63  ctrl_64 
##     15.0     14.2     13.1     15.8     15.3     18.8     16.4     16.7 
##  ctrl_65  ctrl_66  ctrl_67  ctrl_68  ctrl_69  ctrl_70  ctrl_71  ctrl_72 
##     18.2     14.1     16.0     14.9     17.7     17.2     16.0     15.6 
##  ctrl_73  ctrl_74  ctrl_75  ctrl_76  ctrl_77  ctrl_78  ctrl_79  ctrl_80 
##     15.2     14.6     14.2     16.3     17.7     15.0     17.5     13.1 
##  ctrl_81  ctrl_82  ctrl_83  ctrl_84  ctrl_85  ctrl_86  ctrl_87  ctrl_88 
##     16.3     14.4     15.3     15.2     12.7     16.4     15.6     16.4 
##  ctrl_89  ctrl_90  ctrl_91  ctrl_92  ctrl_93  ctrl_94  ctrl_95  ctrl_96 
##     15.2     15.2     16.7     15.3     18.2     14.2     16.3     16.0 
##  ctrl_97  ctrl_98  ctrl_99 ctrl_100 
##     13.0     17.4     15.6     17.1

Imputation and normalization

We use variance stabilization normalization for normalizing the data

Explanation of this:

The VSN method overcomes the limitations of log transformations by accommodating negative values and minimizing the inflated variance around low signal intensities. It calibrates between-feature variation through shifting and scaling mechanism in which all the data are adjusted. Huber et al.

      set.seed(9)
      
#performing normalization with vsn --> variance stabilization normaliztsion
      #all patients
      norm <- normalize_vsn(se_abu_data_filtered)
      meanSdPlot(norm)

      #only ALS
      norm_ALS <- normalize_vsn(se_abu_data_filtered_ALS)
      meanSdPlot(norm_ALS)

# imputation with several methods: MinProb, MAN, KNN
      #all patients
      norm_imp_MinProb <- impute(norm, fun = "MinProb", q=0.01)
## [1] 0.8568569
      norm_imp_man <- impute(norm, fun = "man", shift = 1.8, scale = 0.3)
      norm_imp_knn <- impute(norm, fun = "knn", rowmax = 0.9)
      
      #only ALS
      norm_imp_MinProb_ALS <- impute(norm_ALS, fun = "MinProb", q=0.01)
## [1] 0.9451965
      norm_imp_man_ALS <- impute(norm_ALS, fun = "man", shift = 1.8, scale = 0.3)
      norm_imp_knn_ALS <- impute(norm_ALS, fun = "knn", rowmax = 0.9)
      
      #put all the imputed dataset in a list called "data" and save it as an rds file in results
      data = list(imp_MinProb = norm_imp_MinProb, imp_man = norm_imp_man, imp_knn= norm_imp_knn,
                  imp_MinProb_ALS = norm_imp_MinProb_ALS, imp_man_ALS = norm_imp_man_ALS, imp_knn_ALS = norm_imp_knn_ALS)
      saveRDS(data, file = "results/summarized_experiments_imputed.rds")

Differential Expression analysis

#set the variables for the differential expression analysis
    covariates = c("no_cov", "age_cov", "sex_cov", "age_sex_cov") #a vector for the names of the covariates
    covariates_f = c(~0 + condition,  #the actual formulas to test with different combinations of covariates
                     ~0 + condition + age_cat, #because this function only takes categorical covariates i'm using the categorical age variable
                     ~0 + condition + sex, 
                     ~0 + condition + age_cat + sex)
    patients = c("all_patients", "only_female", "only_male") #names for the analysis without stratification, with only females, and with only males
    patients_f = c(NA, "Female", "Male") #the actual values that need to be used in the function to do the stratification
    res = list() #empty list to save the results

#loop to perform the different types of differential expression analysis    
      l = 1 
      for(k in 1:length(data)){ #a loop for all data types, so the different imputations as well as all testing ALS vs ctrl and spinal vs bulbar
        for(i in 1:length(covariates)){ #a loop for the different covariates
          for(j in 1:length(patients)){ #a loop for sex stratification and no sex stratification
          
             title = paste0(names(data)[k],"_",covariates[i], "_", patients[j]) #construct a title where all the varaibles are stored
              d = data[[k]] #select the k'th dataset
              if(j >1) { d = d[,d$sex == patients_f[j]] } #if j>1 we will apply sex stratification, and this line of code will select either only females or only males
              control = "ctrl" #set the name for the control so that the direction is correct
              if(k > 3){control = "bulbar"} #if we test spinal vs bulbar we want to set bulbar as the control group
              if(i < 3 | j == 1){ #this if() command is to prevent the differential expression analysis from running the test on spinal vs bulbar with sex stratification because the sample sizes are too small for this
                print(title)
                print(dim(d))
                t = test_diff(d, type = "control", control = control, #this function performs the tests and stores the results in 't'
                  test = NULL, design_formula = formula(covariates_f[[i]]))
                res[[l]] = as.data.frame(t@elementMetadata@listData) #the statistics are in @elementMetadata@listData and we will save this part of the results to our 'res' results list
                res[[l]]$fdr = p.adjust(res[[l]][,grep("p.val",colnames(res[[l]]))], method="BH") #we add an FDR column by using Benjamini Hochberg correction on the "p.val" column
                print(dim(res[[l]]))
                names(res)[l] = title
                write.csv(res[[l]], file = paste0("results/DEx", title, ".csv")) #write the results in a CSV file to the results folder. Each loop will create a CSV file
                l = l+1
                
              }}}}
## [1] "imp_MinProb_no_cov_all_patients"
## [1] 530 100
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_MinProb_no_cov_only_female"
## [1] 530  43
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_MinProb_no_cov_only_male"
## [1] 530  57
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_MinProb_age_cov_all_patients"
## [1] 530 100
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_MinProb_age_cov_only_female"
## [1] 530  43
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_MinProb_age_cov_only_male"
## [1] 530  57
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_MinProb_sex_cov_all_patients"
## [1] 530 100
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_MinProb_age_sex_cov_all_patients"
## [1] 530 100
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_man_no_cov_all_patients"
## [1] 530 100
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_man_no_cov_only_female"
## [1] 530  43
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_man_no_cov_only_male"
## [1] 530  57
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_man_age_cov_all_patients"
## [1] 530 100
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_man_age_cov_only_female"
## [1] 530  43
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_man_age_cov_only_male"
## [1] 530  57
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_man_sex_cov_all_patients"
## [1] 530 100
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_man_age_sex_cov_all_patients"
## [1] 530 100
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_knn_no_cov_all_patients"
## [1] 530 100
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_knn_no_cov_only_female"
## [1] 530  43
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_knn_no_cov_only_male"
## [1] 530  57
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_knn_age_cov_all_patients"
## [1] 530 100
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_knn_age_cov_only_female"
## [1] 530  43
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_knn_age_cov_only_male"
## [1] 530  57
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_knn_sex_cov_all_patients"
## [1] 530 100
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_knn_age_sex_cov_all_patients"
## [1] 530 100
## Tested contrasts: als_vs_ctrl
## [1] 530  10
## [1] "imp_MinProb_ALS_no_cov_all_patients"
## [1] 555  50
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_MinProb_ALS_no_cov_only_female"
## [1] 555  15
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_MinProb_ALS_no_cov_only_male"
## [1] 555  35
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_MinProb_ALS_age_cov_all_patients"
## [1] 555  50
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_MinProb_ALS_age_cov_only_female"
## [1] 555  15
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_MinProb_ALS_age_cov_only_male"
## [1] 555  35
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_MinProb_ALS_sex_cov_all_patients"
## [1] 555  50
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_MinProb_ALS_age_sex_cov_all_patients"
## [1] 555  50
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_man_ALS_no_cov_all_patients"
## [1] 555  50
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_man_ALS_no_cov_only_female"
## [1] 555  15
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_man_ALS_no_cov_only_male"
## [1] 555  35
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_man_ALS_age_cov_all_patients"
## [1] 555  50
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_man_ALS_age_cov_only_female"
## [1] 555  15
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_man_ALS_age_cov_only_male"
## [1] 555  35
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_man_ALS_sex_cov_all_patients"
## [1] 555  50
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_man_ALS_age_sex_cov_all_patients"
## [1] 555  50
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_knn_ALS_no_cov_all_patients"
## [1] 555  50
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_knn_ALS_no_cov_only_female"
## [1] 555  15
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_knn_ALS_no_cov_only_male"
## [1] 555  35
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_knn_ALS_age_cov_all_patients"
## [1] 555  50
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_knn_ALS_age_cov_only_female"
## [1] 555  15
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_knn_ALS_age_cov_only_male"
## [1] 555  35
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_knn_ALS_sex_cov_all_patients"
## [1] 555  50
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
## [1] "imp_knn_ALS_age_sex_cov_all_patients"
## [1] 555  50
## Tested contrasts: spinal_vs_bulbar
## [1] 555  10
      saveRDS(res, file = "results/DEx_results_all_in_list.rds")
      
      #save only minprob results into an excel
      res_MinProb = res[grep("MinProb", names(res))]
      names(res_MinProb) = gsub("imp_MinProb_", "", names(res_MinProb))
      library(writexl)
      write_xlsx(res_MinProb, path = "results/DEx_results_MinProb.xlsx")

Visualization 1: mean expressions boxplots

#visualize every dataset, also raw
data_all = list(raw = se_abu_data, #collect all not imputed datasets in a list in "data_all"
                filtered = se_abu_data_filtered, 
                normalized = norm,
                raw_ALS = se_abu_data_ALS, 
                filtered_ALS = se_abu_data_filtered_ALS, 
                normalized_ALS = norm_ALS)
data_all = c(data_all, data) #add the imputed data that is in the "data" list to the "data_all" list

mean_expression_plot = function(data, file_sample, file_mass){ #a function to make the boxplots for mean expression per protein and per patient
  ggplot(data = reshape2::melt(data), aes(x=Var1, y=value)) +
  geom_boxplot(color="darkseagreen4", fill="darkseagreen3") +
  theme_set(theme_minimal()) +
  theme_few() +
  scale_colour_few() +
  theme(legend.position = "none") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  theme(axis.text=element_text(size=6))

ggsave(file_sample, width = 11, height = 8, units = "in")

ggplot(data = reshape2::melt(data), aes(x=reorder(as.factor(Var2),value), y=value)) +
  geom_boxplot(color="darkseagreen4", fill="darkseagreen3") +
  theme_set(theme_minimal()) +
  theme_few() +
  scale_colour_few() +
  theme(legend.position = "none") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  theme(axis.text=element_text(size=6))

ggsave(file_mass, width = 11*2, height = 8, units = "in")
}

for(i in 1:length(data_all)){ #loop to make the boxplots for each dataset in the list "data_all" using the function from above
  mean_expression_plot(data = t(assay(data_all[[i]])), 
                      file_sample = paste0("plots/boxplots_expression_each_sample_",
                                            names(data_all)[i],
                                            ".pdf"),
                      file_mass = paste0("plots/boxplots_expression_each_protein_",
                                            names(data_all)[i],
                                            ".pdf"))
}
## Warning: Removed 12780 rows containing non-finite values (`stat_boxplot()`).
## Removed 12780 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 10024 rows containing non-finite values (`stat_boxplot()`).
## Removed 10024 rows containing non-finite values (`stat_boxplot()`).
## Removed 10024 rows containing non-finite values (`stat_boxplot()`).
## Removed 10024 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 6345 rows containing non-finite values (`stat_boxplot()`).
## Removed 6345 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 5179 rows containing non-finite values (`stat_boxplot()`).
## Removed 5179 rows containing non-finite values (`stat_boxplot()`).
## Removed 5179 rows containing non-finite values (`stat_boxplot()`).
## Removed 5179 rows containing non-finite values (`stat_boxplot()`).

Visualization 1b: Density plot

      #density plots of clinical variables
      d = as.data.frame(cbind(clin[,c("sex", "age_cat", "disease")], as.data.frame(t(assay(data_all[["normalized"]]))))) #use the normalized data and add the variables "sex", "age_cat", and "disease" to it
      long <- melt(setDT(d), id.vars = c("sex", "age_cat", "disease"), variable.name = "protein") #change the dataframe from wide to long
      
      #density plots for variables with and without missing
      d = as.data.frame(assay(data_all[["normalized"]])) #select the normalized, not-imputed dataset
      missing = apply(d, function(x) sum(is.na(x)) , MARGIN = 1) #count the missing for each column
      missing[missing>0] = "yes" #when there is missing, format to "yes"
      missing[missing == 0] = "no" #when there is no missing, format to "no"
      missing = as.factor(missing) 
      d2 = cbind(missing,d) #add the missing vector to the dataset
      long2 <- melt(setDT(d2), id.vars = "missing", variable.name = "protein") #change from wide to long
      
      #plot the 4 different density plots and save them in objects a, b, c, and d
      a = ggplot(long, aes(x=value, color=sex)) +
        geom_density() +
        theme_few() +
        scale_colour_few()
      b = ggplot(long, aes(x=value, color=age_cat)) +
        geom_density() +
        theme_few() +
        scale_colour_few()
      c = ggplot(long, aes(x=value, color=disease)) +
        geom_density() +
        theme_few() +
        scale_colour_few()
      d = ggplot(long2, aes(x=value, color=missing)) +
        geom_density() +
        theme_few() +
        scale_colour_few()
      
      library(ggpubr)
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:cowplot':
## 
##     get_legend
## The following object is masked from 'package:enrichplot':
## 
##     color_palette
      ggarrange(a,b,c,d, ncol = 2, nrow = 2) #place the figures in a grid
## Warning: Removed 10024 rows containing non-finite values (`stat_density()`).
## Removed 10024 rows containing non-finite values (`stat_density()`).
## Removed 10024 rows containing non-finite values (`stat_density()`).
## Removed 10024 rows containing non-finite values (`stat_density()`).

      ggsave(file = "plots/density.pdf", width = 11, height = 8, units = "in") #save the grid

Visualization 2: heatmaps

#this is the code for the non-paper heatmap, one code chunk below is a version that was altered to be used in the paper

set.seed(9)
#functions for saving the heatmaps as figures
        
        #a function that takes the pheatmap file and saves it as pdf
        save_pheatmap_pdf <- function(x, filename, width=11/2, height=8/2) {
           stopifnot(!missing(x))
           stopifnot(!missing(filename))
           pdf(filename, width=width, height=height)
           grid::grid.newpage()
           grid::grid.draw(x$gtable)
           dev.off()
        }
        
        #a function to make the heatmap with pre-set variables
        make_pheatmap <- function(data, cluster_cols = T, main = "Heatmap", clustering_method = "ward.D"){
          p = pheatmap::pheatmap(data, name = "expression", cutree_cols = 1,
                  show_colnames = T,
                  show_rownames = FALSE,
                  fontsize = 6,
                  fontsize_col = 3,
                  annotation_col = annotation,
                  annotation_colors = annotation_colours,
                  #color = viridis::viridis(100, option="G", direction = -1,),
                  color = final_colours$heatmap_scale,
                  main = main,
                  border_color=NA,
                  cluster_cols = cluster_cols,
                  clustering_method = clustering_method,
                  na_col = "grey50", 
                  cluster_rows = F)
          return(p)
        }
        
# all clustering methods:
        method = c("ward.D", "ward.D2"#, "single", "complete", "average" , "mcquitty", "median", "centroid"
                   ) #see hclust() for meaning of each method
        #earlier, we wanted to look at all possible different clustering methods for the heatmap, now we just want to look ad the ward.D ones

# loop for all datasets and all methods          
        for(i in 1:length(data)){ #a loop to make heatmaps for all imputed datasets
          for(j in 1:length(method)){ #a loop for all clustering methods
        title = paste0(names(data)[i], "_", method[j])  #construct a title that tracks where we are in the loops
        print(title)
      
        # get annotations and dataframe ready
        #annotations
        if(i<=3){ #i<=3 means when we are in the datasets that contain both ALS and control
        annotation = data.frame(group = as.factor(data[[i]]$condition), 
                                sex = as.factor(data[[i]]$sex), 
                                age = data[[i]]$age, 
                                onset = as.factor(data[[i]]$onset), 
                                neurofilaments = data[[i]]$neurofilaments,
                                genetics = data[[i]]$genetics, 
                                progression_rate = data[[i]]$progression_rate)
        rownames(annotation) = data[[i]]@colData$ID
        annotation_colours <- list(
          group = c(ctrl = "darkseagreen3", als = "darksalmon"), 
          sex = c(Female = "lightpink", Male ="lightblue3"), 
          age = c("white", "darkgreen"), 
          onset = c(ctrl = "grey50", spinal = "mediumpurple1", bulbar = "mediumaquamarine"),
          neurofilaments = c("white", "royalblue"),
          genetics = c(not_performed = "grey80", C9orf72 = "aquamarine4", negative = "salmon"),
          progression_rate = c("yellow", "red"))
        }
                if(i>3){ #i>3 means when we are in the datasets that contains only ALS, which requires different annotation
        annotation = data.frame(group = as.factor(data[[i]]$condition), 
                                sex = as.factor(data[[i]]$sex), 
                                age_at_onset = data[[i]]$age_at_onset, 
                                neurofilaments = data[[i]]$neurofilaments,
                                genetics = data[[i]]$genetics, 
                                progression_rate = data[[i]]$progression_rate)
        rownames(annotation) = data[[i]]@colData$ID
        annotation_colours <- list(
          group = c(spinal = "mediumpurple1", bulbar = "mediumaquamarine"),
          sex = c(Female = "lightpink", Male ="lightblue3"), 
          age_at_onset = c("white", "darkgreen"),
          neurofilaments = c("white", "royalblue"),
          genetics = c(not_performed = "grey80", C9orf72 = "aquamarine4", negative = "salmon"),
          progression_rate = c("yellow", "red"))
        }

#create heatmaps with all patients. There are three different heatmaps:
        
        #1. without grouping, all proteins
        p = make_pheatmap(data = assay(data[[i]]), 
                          cluster_cols = T, 
                          main = paste0("Heatmap all proteins\n",title, "\nclustered"), 
                          clustering_method = method[j])
        save_pheatmap_pdf(p, filename = paste0("plots/heatmap_clustered_",title,".pdf"))
        
        #2. without grouping, 100 most variable proteins
        d = assay(data[[i]])
        d2 = head(order(rowVars(d),decreasing = T),100)
        p = make_pheatmap(data = d[d2,], cluster_cols = T, main = paste0("Heatmap 100 most variable proteins\n",title, "\nclustered"), clustering_method = method[j])
        save_pheatmap_pdf(p, filename = paste0("plots/heatmap_clustered_mostvar_",title,".pdf"))
        save_pheatmap_pdf(p, filename = paste0("plots/heatmap_clustered_mostvar_",title,"_big.pdf"), width=11, height=8)
        
        #3. heatmap with only significant genes
        r = res[["imp_MinProb_no_cov_only_male"]]
        sig_met = r$name[r$fdr<0.1]
            if(length(sig_met)>2){
              sig_met2 = sig_met[sig_met %in% rownames(d)]
              d = d[sig_met2,]
              p = make_pheatmap(data = d, cluster_cols = T, main = paste0("Heatmap only significant proteins (FDR 0.1)\n",title, "\nclustered","\n using model imp_MinProb_no_cov_only_male" ), clustering_method = method[j])
              save_pheatmap_pdf(p, paste0("plots/heatmap_clustered_only_significant_",title,".pdf"))
            }}}
## [1] "imp_MinProb_ward.D"
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.

## [1] "imp_MinProb_ward.D2"
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.

## [1] "imp_man_ward.D"
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.

## [1] "imp_man_ward.D2"
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.

## [1] "imp_knn_ward.D"
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.

## [1] "imp_knn_ward.D2"
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.

## [1] "imp_MinProb_ALS_ward.D"
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.

## [1] "imp_MinProb_ALS_ward.D2"
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.

## [1] "imp_man_ALS_ward.D"
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.

## [1] "imp_man_ALS_ward.D2"
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.

## [1] "imp_knn_ALS_ward.D"
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.

## [1] "imp_knn_ALS_ward.D2"
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.

Visualization 2b: heatmap for paper

#this is the heatmap version suitable for the paper
#with the Lingor lab, the plan was to create a heatmap where we separate ALS and control
#unfortunately, this is not a setting the pheatmap function allows. 
#so we have to edit multiple heatmaps to get a version that we like
#to generate multiple heatmaps in multiple sizes, we can combine all elements with a figure editing program to get it exactly where we need it

set.seed(9)

#set ranges of values for the heatmap 
breaksList = seq(-3, 3, by = 0.1)

#functions for saving the heatmaps as figures
        
        #function to save the pheatmap object as a pdf file
        save_pheatmap_pdf <- function(x, filename, width=11/2, height=8/2) {
           stopifnot(!missing(x))
           stopifnot(!missing(filename))
           pdf(filename, width=width, height=height)
           grid::grid.newpage()
           grid::grid.draw(x$gtable)
           dev.off()
        }
        
        #function to make the heatmap with pre-set parameters
        make_pheatmap <- function(data, cluster_cols = T, main = "Heatmap", clustering_method = "ward.D"){
          p = pheatmap::pheatmap(data, name = "expression", cutree_cols = 1,
                  show_colnames = T,
                  show_rownames = FALSE,
                  fontsize = 6,
                  fontsize_col = 3,
                  annotation_col = annotation,
                  annotation_colors = annotation_colours,
                  #color = viridis::viridis(100, option="G", direction = -1,),
                  #color = final_colours$heatmap_scale,
                  colorRampPalette(final_colours$heatmap_scale)(length(breaksList)),
                  breaks = breaksList,
                  main = main,
                  border_color=NA,
                  cluster_cols = cluster_cols,
                  clustering_method = clustering_method,
                  na_col = "grey50", 
                  scale = "row",
                  cluster_rows = F)
          return(p)
        }
        
# all clustering methods:
        method = "ward.D2"
         
        i = "imp_MinProb"
      
        # get annotations and dataframe ready
        annotation = data.frame(disease = as.factor(data[[i]]$condition), 
                                onset = as.factor(data[[i]]$onset), 
                                sex = as.factor(data[[i]]$sex), 
                                age = data[[i]]$age, 
                                age_at_onset = data[[i]]$age_at_onset,
                                neurofilaments = data[[i]]$neurofilaments,
                                pNFh = data[[i]]$pNFh,
                                genetics = data[[i]]$genetics, 
                                progression_rate = data[[i]]$progression_rate,
                                slow_vital_capacity = data[[i]]$slow_vital_capacity)
        rownames(annotation) = data[[i]]@colData$ID
        annotation_colours <- list(
          disease = final_colours$disease_status, 
          sex = final_colours$male_female, 
          age = final_colours$age_scale, 
          age_at_onset = final_colours$age_at_onset_scale,
          onset = final_colours$onset,
          neurofilaments = final_colours$neurofilaments,
          pNFh = final_colours$pNFh_scale,
          genetics = final_colours$genetic_testing,
          progression_rate = final_colours$disease_progression_scale,
          slow_vital_capacity = final_colours$slow_vital_capacity_scale)
               
#create heatmaps with all patients
        
        title = "all_patients"
        
        #without grouping, all proteins
        p = make_pheatmap(data = assay(data[[i]]), 
                          cluster_cols = T, 
                          main = paste0("Heatmap all proteins\n",title, "\nclustered"), 
                          clustering_method = method) 

        #p = p + scale_fill_continuous(limits = c(15,40), breaks = c(15, 20, 25, 30, 35, 40))
        save_pheatmap_pdf(p, filename = paste0("plots/paper/heatmap_clustered_",title,".pdf"))
## quartz_off_screen 
##                 2
        # without grouping, 100 most variable proteins
        d = assay(data[[i]])
        d2 = head(order(rowVars(d),decreasing = T),100)
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.
        p = make_pheatmap(data = d[d2,], 
                          cluster_cols = T, 
                          main = paste0("Heatmap 100 most variable proteins\n",title, "\nclustered"), 
                          clustering_method = method)

        save_pheatmap_pdf(p, filename = paste0("plots/paper/heatmap_clustered_mostvar_",title,".pdf")) #normal sized version
## quartz_off_screen 
##                 2
        save_pheatmap_pdf(p, filename = paste0("plots/paper/heatmap_clustered_mostvar_",title,"_big.pdf"), width=11, height=8) #big version
## quartz_off_screen 
##                 2
#create heatmaps with ALS and control separately
        
        title = "ALS_patients"
        
        #all proteins
        d = assay(data[[i]])
        d = d[,grep("als", colnames(d))]
        
        p = make_pheatmap(data = d, 
                          cluster_cols = T, 
                          main = paste0("Heatmap all proteins\n",title, "\nclustered"), 
                          clustering_method = method)

        save_pheatmap_pdf(p, filename = paste0("plots/paper/heatmap_clustered_",title,".pdf"), width=3)
## quartz_off_screen 
##                 2
        # without grouping, 100 most variable proteins
        p = make_pheatmap(data = d[d2,], 
                          cluster_cols = T, 
                          main = paste0("Heatmap 100 most variable proteins\n",title, "\nclustered"), 
                          clustering_method = method)

        save_pheatmap_pdf(p, filename = paste0("plots/paper/heatmap_clustered_mostvar_",title,".pdf"), width=3)
## quartz_off_screen 
##                 2
        save_pheatmap_pdf(p, filename = paste0("plots/paper/heatmap_clustered_mostvar_",title,"_big.pdf"), width=11, height=8)
## quartz_off_screen 
##                 2
        #select only ctrl
        title = "ctrl_patients"
        
        #all proteins
        d = assay(data[[i]])
        d = d[,grep("ctrl", colnames(d))]
        
        annotation = annotation[,!colnames(annotation) == "progression_rate"]
        annotation = annotation[,!colnames(annotation) == "pNFh"]
        annotation = annotation[,!colnames(annotation) == "slow_vital_capacity"]
        annotation = annotation[,!colnames(annotation) == "age_at_onset"]
        
        p = make_pheatmap(data = d, 
                          cluster_cols = T, 
                          main = paste0("Heatmap all proteins\n",title, "\nclustered"), 
                          clustering_method = method)

        save_pheatmap_pdf(p, filename = paste0("plots/paper/heatmap_clustered_",title,".pdf"), width=3)
## quartz_off_screen 
##                 2
        # without grouping, 100 most variable proteins
        p = make_pheatmap(data = d[d2,], 
                          cluster_cols = T, 
                          main = paste0("Heatmap 100 most variable proteins\n",title, "\nclustered"), 
                          clustering_method = method)

        save_pheatmap_pdf(p, filename = paste0("plots/paper/heatmap_clustered_mostvar_",title,".pdf"), width=3)
## quartz_off_screen 
##                 2
        save_pheatmap_pdf(p, filename = paste0("plots/paper/heatmap_clustered_mostvar_",title,"_big.pdf"), width=11, height=8)
## quartz_off_screen 
##                 2
#heatmaps with no annotations
        
        annotation = NULL
        
        #create heatmaps with ALS and control separately
        
        title = "ALS_patients"
        
        #all proteins
        d = assay(data[[i]])
        d = d[,grep("als", colnames(d))]
        
        p = make_pheatmap(data = d, 
                          cluster_cols = T, 
                          main = paste0("Heatmap all proteins\n",title, "\nclustered"), 
                          clustering_method = method)

        save_pheatmap_pdf(p, filename = paste0("plots/paper/heatmap_clustered_no_ann_",title,".pdf"), width=3)
## quartz_off_screen 
##                 2
        # without grouping, 100 most variable proteins
        p = make_pheatmap(data = d[d2,], 
                          cluster_cols = T, 
                          main = paste0("Heatmap 100 most variable proteins\n",title, "\nclustered"), 
                          clustering_method = method)

        save_pheatmap_pdf(p, filename = paste0("plots/paper/heatmap_clustered_mostvar_no_ann_",title,".pdf"), width=3)
## quartz_off_screen 
##                 2
        save_pheatmap_pdf(p, filename = paste0("plots/paper/heatmap_clustered_mostvar_no_ann_",title,"_big.pdf"), width=11, height=8)
## quartz_off_screen 
##                 2
        #select only ctrl
        title = "ctrl_patients"
        
        #all proteins
        d = assay(data[[i]])
        d = d[,grep("ctrl", colnames(d))]
        
        p = make_pheatmap(data = d, 
                          cluster_cols = T, 
                          main = paste0("Heatmap all proteins\n",title, "\nclustered"), 
                          clustering_method = method)

        save_pheatmap_pdf(p, filename = paste0("plots/paper/heatmap_clustered_no_ann_",title,".pdf"), width=3)
## quartz_off_screen 
##                 2
        # without grouping, 100 most variable proteins
        p = make_pheatmap(data = d[d2,], 
                          cluster_cols = T, 
                          main = paste0("Heatmap 100 most variable proteins\n",title, "\nclustered"), 
                          clustering_method = method)

        save_pheatmap_pdf(p, filename = paste0("plots/paper/heatmap_clustered_mostvar_no_ann_",title,".pdf"), width=3)
## quartz_off_screen 
##                 2
        save_pheatmap_pdf(p, filename = paste0("plots/paper/heatmap_clustered_mostvar_no_ann_",title,"_big.pdf"), width=11, height=8)
## quartz_off_screen 
##                 2

Visualization 3: UMAP plots

#these UMAP plots are not the ones altered for the paper. The ones altered for the paper can be found in the next code chunk.

# set seed for reproducible results
set.seed(9)

#set the colours for the UMAP plots
          group = c("darksalmon","darkseagreen3")
          sex = c("lightpink", "lightblue3")
          onset = c("mediumaquamarine", "mediumpurple1","grey80")
          onset_ALS = c("mediumaquamarine", "mediumpurple1")
          age_cat = c("darkgreen", "lightgreen")
          center = c("blue3", "yellow4")

        
#THE FUNCTION 
#here, I make a function to make a umap plot. This has all the parameters of the figure pre-set

UMAP_density_plot = function(data, 
                             ggtitle = "UMAP with disease status labels", 
                             legend_name = "Disease status", 
                             labels = clin$Condition, 
                             file_location = "plots/UMAP_condition.pdf", 
                             file_location_labels = "plots/UMAP_condition_labels.pdf",
                             colour_set = c("seagreen4", "slateblue1", "salmon"), 
                             shape = rep(16, nrow(umap_plot)), 
                             shapeTF = F){
      # run umap function
      umap_out = umap::umap(data)
      umap_plot = as.data.frame(umap_out$layout)
      
      #add condition labels
      umap_plot$group = labels

      # plot umap
      p1 = ggplot(umap_plot) + geom_point(aes(x=V1, y=V2, color = as.factor(group),), shape=shape) +
        ggtitle(ggtitle) +
          theme_few() +
          scale_colour_few() +
          scale_color_manual(name = legend_name, 
                           labels = levels(as.factor(umap_plot$group)), 
                           values = colour_set) + 
          scale_fill_manual(values=colour_set)  

      #add shape argument if we want to change shapes
      if(shapeTF){
        p1 = p1 + scale_shape_manual(name = "Sex", 
                    labels = levels(as.factor(shape)), 
                    values=c(15, 17))
      }
  
      xdens <- 
        axis_canvas(p1, axis = "x") + 
        geom_density(data = umap_plot, aes(x = V1, fill = group, colour = group), alpha = 0.3) +
        scale_fill_manual( values = colour_set) + 
        scale_colour_manual( values = colour_set)
      ydens <-
        axis_canvas(p1, axis = "y", coord_flip = TRUE) + 
        geom_density(data = umap_plot, aes(x = V2, fill = group, colour = group), alpha = 0.3) +
        coord_flip() +
        scale_fill_manual(values = colour_set) + 
        scale_colour_manual( values = colour_set)
      p1 %>%
        insert_xaxis_grob(xdens, grid::unit(1, "in"), position = "top") %>%
        insert_yaxis_grob(ydens, grid::unit(1, "in"), position = "right") %>%
        ggdraw()
      
      p1
      # save umap
      ggsave(file_location, width = 11/2, height = 8/2, units = "in")
      
      p1 + geom_text(label = rownames(umap_plot), x = umap_plot$V1, y = umap_plot$V2,
                     hjust = 0, nudge_x = 1, size = 1.5, colour = "grey")
      
      # save umap with labels
      ggsave(file_location_labels, width = 11/2, height = 8/2, units = "in")
}


#a loop to plot the UMAPs for all imputed datasets I have 
for(i in 1:length(data)){
  d = t(assay(data[[i]])) #extract abundancy data from the SE
  labels_disease = data[[i]]$condition #extract disease labels
  if(i<4){labels_onset = data[[i]]$onset} #only run this one when we have case & control
  labels_sex = data[[i]]$sex
  labels_age = data[[i]]$age_cat
  labels_center = data[[i]]$center
  title = names(data)[i] #use name of the dataset as the title
  if(i>3){group = onset_ALS} #change the colour labels when switching from case & control to only ALS
      
#perform plots with function
        #plot disease
        UMAP_density_plot(data = d, 
                          ggtitle = paste0("UMAP with disease status labels\n", title), 
                          legend_name = "Disease status", 
                          labels = labels_disease, 
                          file_location = paste0("plots/UMAP_condition_",title,".pdf"),
                          file_location_labels = paste0("plots/UMAP_condition_labels_",title,".pdf"),
                          colour_set = group)
        #disease color AND sex label
         UMAP_density_plot(data = d, 
                          ggtitle = paste0("UMAP with disease status labels & sex shape\n", title), 
                          legend_name = "Disease status", 
                          labels = labels_disease, 
                          file_location = paste0("plots/UMAP_condition_sex_shape",title,".pdf"),
                          file_location_labels = paste0("plots/UMAP_condition_labels_sex_shape",title,".pdf"),
                          colour_set = group,
                          shape = labels_sex,
                          shapeTF = T)
        #plot center
          UMAP_density_plot(data = d, 
                          ggtitle = paste0("UMAP with center labels\n", title), 
                          legend_name = "Disease status", 
                          labels = labels_center, 
                          file_location = paste0("plots/UMAP_center_",title,".pdf"),
                          file_location_labels = paste0("plots/UMAP_center_labels_",title,".pdf"),
                          colour_set = center)
         
        if(i<4){ #only run this one when we have case & control
          #plot onset
          UMAP_density_plot(data = d, 
                          ggtitle = paste0("UMAP with onset status labels\n", title), 
                          legend_name = "Onset labels", 
                          labels = labels_onset, 
                          file_location = paste0("plots/UMAP_onset_",title,".pdf"), 
                          file_location_labels = paste0("plots/UMAP_onset_labels_",title,".pdf"),
                          colour_set = onset)
        }
        
        #plot sex
        UMAP_density_plot(data = d, 
                          ggtitle = paste0("UMAP with sex labels\n", title), 
                          legend_name = "Sex label", 
                          labels = labels_sex, 
                          file_location = paste0("plots/UMAP_sex_",title,".pdf"), 
                          file_location_labels = paste0("plots/UMAP_sex_labels_",title,".pdf"), 
                          colour_set = sex)
        #plot age categorical
        UMAP_density_plot(data = d, 
                          ggtitle = paste0("UMAP with age labels\n", title), 
                          legend_name = "Age label", 
                          labels = labels_age, 
                          file_location = paste0("plots/UMAP_age_",title,".pdf"), 
                          file_location_labels = paste0("plots/UMAP_age_labels_",title,".pdf"), 
                          colour_set = age_cat)
        
#perform plots with only most variable proteins      
        d2 = head(order(colVars(d),decreasing = T),100) #d2 is a vector with the names of only the 100 most variable proteins
        
        #now we rerun all the plots, this time using this reduced table
        
        UMAP_density_plot(data = d[,d2], 
                          ggtitle = paste0("UMAP with disease status labels\n", title, "\nwith 100 most variable proteins"), 
                          legend_name = "Disease status", 
                          labels = labels_disease, 
                          file_location = paste0("plots/UMAP_mostvar_condition_",title,".pdf"),
                          file_location_labels = paste0("plots/UMAP_mostvar_condition_labels_",title,".pdf"),
                          colour_set = group)
        
          #disease color AND sex label
         UMAP_density_plot(data = d[,d2], 
                          ggtitle = paste0("UMAP with disease status labels & sex shape\n", title, "\nwith 100 most variable proteins"), 
                          legend_name = "Disease status", 
                          labels = labels_disease, 
                          file_location = paste0("plots/UMAP_mostvar_condition_sex_shape",title,".pdf"),
                          file_location_labels = paste0("plots/UMAP_mostvar_condition_labels_sex_shape",title,".pdf"),
                          colour_set = group,
                          shape = labels_sex,
                          shapeTF = T)
        
        if(i<4){ #only run this one when we have case & control
          UMAP_density_plot(data = d[,d2], 
                            ggtitle = paste0("UMAP with onset status labels\n", title, "\nwith 100 most variable proteins"), 
                            legend_name = "Onset labels", 
                            labels = labels_onset, 
                            file_location = paste0("plots/UMAP_mostvar_onset_",title,".pdf"), 
                            file_location_labels = paste0("plots/UMAP_mostvar_onset_labels_",title,".pdf"),
                            colour_set = onset)}
        
        UMAP_density_plot(data = d[,d2], 
                          ggtitle = paste0("UMAP with sex labels\n", title, "\nwith 100 most variable proteins"), 
                          legend_name = "Sex label", 
                          labels = labels_sex, 
                          file_location = paste0("plots/UMAP_mostvar_sex_",title,".pdf"), 
                          file_location_labels = paste0("plots/UMAP_mostvar_sex_labels_",title,".pdf"), 
                          colour_set = sex)
        
        UMAP_density_plot(data = d[,d2], 
                          ggtitle = paste0("UMAP with age labels\n", title, "\nwith 100 most variable proteins"), 
                          legend_name = "Age label", 
                          labels = labels_age, 
                          file_location = paste0("plots/UMAP_mostvar_age_",title,".pdf"), 
                          file_location_labels = paste0("plots/UMAP_mostvar_age_labels_",title,".pdf"), 
                          colour_set = age_cat)
        
        UMAP_density_plot(data = d[,d2], 
                          ggtitle = paste0("UMAP with center labels\n", title, "\nwith 100 most variable proteins"), 
                          legend_name = "Center label", 
                          labels = labels_center, 
                          file_location = paste0("plots/UMAP_mostvar_center_",title,".pdf"), 
                          file_location_labels = paste0("plots/UMAP_mostvar_center_labels_",title,".pdf"), 
                          colour_set = center)
}
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

Visualization 3b: UMAP plots for paper

#these are the final plots that could be used in the paper
#they use the colour palette defined at the start of this file, and have more pleasent size


# set seed for reproducible results
set.seed(9)

#THE FUNCTION 

UMAP_density_plot = function(data, 
                             ggtitle = "UMAP with disease status labels", 
                             legend_name = "Disease status", 
                             labels = clin$Condition, 
                             file_location = "plots/UMAP_condition.pdf", 
                             file_location_labels = "plots/UMAP_condition_labels.pdf",
                             colour_set = c("seagreen4", "slateblue1", "salmon"), 
                             shape = rep(16, nrow(umap_plot)), 
                             shapeTF = F){
      # run umap function
      umap_out = umap::umap(data)
      umap_plot = as.data.frame(umap_out$layout)
      
      #add condition labels
      umap_plot$group = labels

      # plot umap
      p1 = ggplot(umap_plot) + 
        geom_point(aes(x=V1, y=V2, color = as.factor(group),), shape=shape, alpha = 0.75, size = 4) +
        ggtitle(ggtitle) +
          theme_few() +
          scale_colour_few() +
          scale_color_manual(name = legend_name, 
                           labels = levels(as.factor(umap_plot$group)), 
                           values = colour_set) + 
          scale_fill_manual(values=colour_set) +
        labs(x = "UMAP1", y = "UMAP2")

      #add shape argument if we want to change shapes
      if(shapeTF){
        p1 = p1 + scale_shape_manual(name = "Sex", 
                    labels = levels(as.factor(shape)), 
                    values=c(15, 17))
      }
  
      xdens <- 
        axis_canvas(p1, axis = "x") + 
        geom_density(data = umap_plot, aes(x = V1, fill = group, colour = group), alpha = 0.3) +
        scale_fill_manual( values = colour_set) + 
        scale_colour_manual( values = colour_set)
      ydens <-
        axis_canvas(p1, axis = "y", coord_flip = TRUE) + 
        geom_density(data = umap_plot, aes(x = V2, fill = group, colour = group), alpha = 0.3) +
        coord_flip() +
        scale_fill_manual(values = colour_set) + 
        scale_colour_manual( values = colour_set)
      p1 %>%
        insert_xaxis_grob(xdens, grid::unit(1, "in"), position = "top") %>%
        insert_yaxis_grob(ydens, grid::unit(1, "in"), position = "right") %>%
        ggdraw()
      
      p1
      # save umap
      ggsave(file_location, width = 11/2, height = 8/2, units = "in")
      
      p1 + geom_text(label = rownames(umap_plot), x = umap_plot$V1, y = umap_plot$V2,
                     hjust = 0, nudge_x = 1, size = 1.5, colour = "grey")
      
      # save umap with labels
      ggsave(file_location_labels, width = 11/2, height = 8/2, units = "in")
}

#set the parameters
  d = t(assay(data[["imp_MinProb"]])) #we only make plots for the MinProb imputed data
  i = "imp_MinProb"
  labels_disease = data[[i]]$condition
  labels_onset = data[[i]]$onset
  labels_sex = data[[i]]$sex
  labels_age = data[[i]]$age_cat
  labels_center = data[[i]]$center
      
#perform plots with function      
        UMAP_density_plot(data = d, 
                          ggtitle = "UMAP with disease status labels", 
                          legend_name = "Disease status", 
                          labels = labels_disease, 
                          file_location = "plots/paper/UMAP_condition.pdf",
                          file_location_labels = "plots/paper/UMAP_condition_labels.pdf",
                          colour_set = final_colours$disease_status)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
        #disease color AND sex label
         UMAP_density_plot(data = d, 
                          ggtitle = "UMAP with disease status labels & sex shape", 
                          legend_name = "Disease status", 
                          labels = labels_disease, 
                          file_location = "plots/paper/UMAP_condition_sex_shape.pdf",
                          file_location_labels = "plots/paper/UMAP_condition_labels_sex_shape.pdf",
                          colour_set = final_colours$disease_status,
                          shape = labels_sex,
                          shapeTF = T)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
        #center labels
          UMAP_density_plot(data = d, 
                          ggtitle = "UMAP with center labels", 
                          legend_name = "Center", 
                          labels = labels_center, 
                          file_location = "plots/paper/UMAP_center.pdf",
                          file_location_labels = "plots/paper/UMAP_center_labels.pdf",
                          colour_set = final_colours$center)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
        #onset labels
          UMAP_density_plot(data = d, 
                          ggtitle = "UMAP with onset status labels", 
                          legend_name = "Onset labels", 
                          labels = labels_onset, 
                          file_location = "plots/paper/UMAP_onset.pdf", 
                          file_location_labels = "plots/paper/UMAP_onset_labels.pdf",
                          colour_set = final_colours$onset)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
        UMAP_density_plot(data = d, 
                          ggtitle = "UMAP with sex labels", 
                          legend_name = "Sex label", 
                          labels = labels_sex, 
                          file_location = "plots/paper/UMAP_sex.pdf", 
                          file_location_labels = "plots/paper/UMAP_sex_labels_.pdf", 
                          colour_set = final_colours$male_female)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
        UMAP_density_plot(data = d, 
                          ggtitle = "UMAP with age labels", 
                          legend_name = "Age label", 
                          labels = labels_age, 
                          file_location = "plots/paper/UMAP_age.pdf", 
                          file_location_labels = "plots/paper/UMAP_age_labels.pdf", 
                          colour_set = final_colours$age_scale[c(4,7)])
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

Visualization 4: volcano plots

#these are not the volcano plots that were improved to be used in the paper, these can be found in the chunk below

#the function for the volcano plot:
    
    volcano_plot <- function(data_res, alpha_sig, name_title){
      logFC = data_res[,grep("diff",colnames(data_res))]
      fdr = data_res$fdr
      df <- data.frame(x = logFC, 
                       y = -log10(fdr),
                       name = data_res$name)
      names(df) <- c("x","y","name")
      df <- df %>%
        mutate(omic_type = case_when(x >= 0 & y >= (-log10(alpha_sig)) ~ "up",
                                     x <= (0) & y >= (-log10(alpha_sig)) ~ "down",
                                     TRUE ~ "ns")) 
      cols <- c("up" = "#d4552b", "down" = "#26b3ff", "ns" = "grey") 
      sizes <- c("up" = 2, "down" = 2, "ns" = 1) 
      alphas <- c("up" = 0.7, "down" = 0.7, "ns" = 0.5)
      ggplot(data = df, aes(x,y)) + 
        geom_point(aes(colour = omic_type), 
                   alpha = 0.5, 
                   shape = 16,
                   size = 3) + 
        geom_hline(yintercept = -log10(alpha_sig),
                   linetype = "dashed") + 
        geom_vline(xintercept = 0,linetype = "dashed") +
        geom_point(data = filter(df, y >= (-log10(alpha_sig))),
                   aes(colour = omic_type), 
                   alpha = 0.5, 
                   shape = 16,
                   size = 4) + 
        #annotate(geom="text", x=-1.9, y= (-log10(alpha_sig)) + 0.15, label="FDR = 10%",size = 5) +
        geom_text_repel(data = filter(df, y >= (-log10(alpha_sig)) & y > 0),
                         aes(label = name),
                         force = 1,
                        hjust = 1,
                         nudge_x = - 0.3,
                        nudge_y = 0.1,
                        #direction = "x",
                         max.overlaps = 5,
                        segment.size = 0.2,
                         size = 4) +
        geom_text_repel(data = filter(df, y >= (-log10(alpha_sig)) & y < 0),
                        aes(label = name),
                        force = 1,
                        hjust = 0,
                        nudge_x = 0.3,
                        nudge_y = 0.1,
                        #direction = "y",
                        max.overlaps = 5,
                        size = 4) +
        scale_colour_manual(values = cols) + 
        scale_fill_manual(values = cols) + 
        scale_x_continuous(expand = c(0, 0), 
                           limits = c(-3, 3)) + 
        scale_y_continuous(expand = c(0, 0), limits = c(-0.1, NA)) +
        labs(title = name_title,
             x = "log2(fold change)",
             y = expression(-log[10] ~ "(adjusted p-value)"),
             colour = "Differential \nExpression") +
        theme_classic() + # Select theme with a white background  
        theme(axis.title.y = element_text(size = 14),
              axis.title.x = element_text(size = 14),
              axis.text = element_text(size = 12),
              plot.title = element_text(size = 15, hjust = 0.5),
              text = element_text(size = 14)) +
        annotate("text", x = 2, y = 0, label = paste0(sum(df$omic_type=="up"), " more abundant \n", sum(df$omic_type=="down"), " less abundant"))
    }

#a loop to make a volcano plot for all results that we had from the differential expression analysis
    for(i in 1:length(res)){
        volcano_plot(res[[i]], 0.05 , paste0("Volcano plot proteomics \nalpha = FDR 0.05\n", names(res)[i]))
        ggsave(paste0("plots/volcano_plot_", names(res)[i], "_FDR0.05.pdf"), 
                         width = 11, height = 8, units = "in")
        volcano_plot(res[[i]], 0.1 , paste0("Volcano plot proteomics \nalpha = FDR 0.1\n", names(res)[i]))
        ggsave(paste0("plots/volcano_plot_", names(res)[i], "_FDR0.1.pdf"), 
                         width = 11, height = 8, units = "in")
    }

Visualization 4b: volcano plots for paper

#when we have decided for labels we could write them like this:
#labels = c("PROT1", "PROT2", "PROT3")

#PLOT OF MINPROB, ALS vs CTRL, NOT STRATIFIED, age and sex corrected

#first the function for the volcano plot
volcano_plot <- function(data_res, alpha_sig, name_title, labels){
  logFC = data_res[,grep("diff",colnames(data_res))]
  fdr = data_res$fdr
  df <- data.frame(x = logFC, 
                   y = -log10(fdr),
                   name = data_res$name)
  names(df) <- c("x","y","name")
  df <- df %>%
    mutate(omic_type = case_when(x >= 0 & y >= (-log10(alpha_sig)) ~ "up",
                                 x <= (0) & y >= (-log10(alpha_sig)) ~ "down",
                                 TRUE ~ "ns")) 
  cols <- c("up" = "#D73027", "down" = "#D73027", "ns" = "grey") #use the final_colours$disease status als for this
  sizes <- c("up" = 2, "down" = 2, "ns" = 1) 
  alphas <- c("up" = 0.7, "down" = 0.7, "ns" = 0.5)
  ggplot(data = df, aes(x,y)) + 
    geom_point(aes(colour = omic_type), 
               alpha = 0.5, 
               shape = 16,
               size = 2) + 
    geom_hline(yintercept = -log10(alpha_sig),
               linetype = "dashed") + 
    geom_vline(xintercept = 0,linetype = "dashed") +
    geom_point(data = filter(df, y >= (-log10(alpha_sig))),
               aes(colour = omic_type), 
               alpha = 0.5, 
               shape = 16,
               size = 4) + 
    #annotate(geom="text", x=-1.9, y= (-log10(alpha_sig)) + 0.15, label="FDR = 10%",size = 5) +
    geom_text_repel(data = filter(df, y >= (-log10(alpha_sig)) & y > 0 & name %in% labels),
                     aes(label = name),
                     force = 1,
                    hjust = 1,
                    nudge_x = -0.3,
                    nudge_y = 1.5,
                    direction = "both",
                    max.overlaps = 5,
                     size = 4) +
    geom_text_repel(data = filter(df, y >= (-log10(alpha_sig)) & y < 0 & name %in% labels),
                    aes(label = name),
                    force = 1,
                    hjust = 0,
                    nudge_x = 0.3,
                    nudge_y = 1.5,
                    direction = "both",
                    max.overlaps = 5,
                    size = 4) +
    scale_colour_manual(values = cols) + 
    scale_fill_manual(values = cols) + 
    scale_x_continuous(expand = expansion(mult = .05), 
                       limits = c(-3, 3)) + 
    scale_y_continuous(expand = expansion(mult = .05), limits = c(-0.1, NA)) +
    labs(title = name_title,
         x = "log2(fold change)",
         y = expression(-log[10] ~ "(adjusted p-value)"),
         colour = "Differential \nAbundancy") +
    theme_few() + # Select theme with a white background  
    theme(axis.title.y = element_text(size = 14),
          axis.title.x = element_text(size = 14),
          axis.text = element_text(size = 12),
          plot.title = element_text(size = 15, hjust = 0.5),
          text = element_text(size = 14)) +
    annotate("text", x = 2, y = 0.5, label = paste0(sum(df$omic_type=="up"), " more abundant \n", sum(df$omic_type=="down"), " less abundant"))
}
    

    plots = list() #initialize a list to save all the plots
    labels = list() #initialize a list for the labels
    
    #make labels for the plots
    
    names_dfs = c("imp_MinProb_age_sex_cov_all_patients", 
                  "imp_MinProb_age_cov_only_female",
                  "imp_MinProb_age_cov_only_male")
    
    for(i in 1:length(names_dfs)){
      # Filter, arrange, and select top 5 names with positive fold change and lowest p-value
      top_5_positive <- res[[names_dfs[i]]] %>%
        filter(als_vs_ctrl_diff > 0) %>%
        arrange(fdr) %>%
        slice_head(n = 5) %>%
        pull(name)
      
      # Filter, arrange, and select top 5 names with negative fold change and lowest p-value
      top_5_negative <- res[[names_dfs[i]]] %>%
        filter(als_vs_ctrl_diff < 0) %>%
        arrange(fdr) %>%
        slice_head(n = 5) %>%
        pull(name)
      
      # Concatenate the two vectors
      labels[[i]]  = c(top_5_positive, top_5_negative)
      names(labels)[i] = names_dfs[i]
    }
    

    plots[[1]] = volcano_plot(res[["imp_MinProb_age_sex_cov_all_patients"]], 
                              0.05, #first plot with FDR 0.05
                              "Volcano plot proteomics \nalpha = FDR 0.05\nall patients, age & sex cov, imp MinProb",
                              labels = labels$imp_MinProb_age_sex_cov_all_patients) 
    plots[[2]] = volcano_plot(res[["imp_MinProb_age_sex_cov_all_patients"]], 
                              0.1 , #second plot with FDR 0.01
                              "Volcano plot proteomics \nalpha = FDR 0.1\nall patients, age & sex cov, imp MinProb",
                              labels = labels$imp_MinProb_age_sex_cov_all_patients) 
    plots[[3]] = volcano_plot(res[["imp_MinProb_age_cov_only_female"]], 
                              0.05 , 
                              "Volcano plot proteomics \nalpha = FDR 0.05\nonly female, age cov, imp MinProb",
                              labels = labels$imp_MinProb_age_cov_only_female) 
    plots[[4]] = volcano_plot(res[["imp_MinProb_age_cov_only_male"]], 
                              0.05 , 
                              "Volcano plot proteomics \nalpha = FDR 0.05\nonly male, age cov, imp MinProb",
                              labels = labels$imp_MinProb_age_cov_only_male)
    
    ggarrange(plotlist = plots, ncol = 2, nrow = 2)
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

    ggsave("plots/paper/volcano_plots_strat_and_notstrat_age_sex_cov.pdf", 
                     width = 11, height = 8, units = "in")

Visualization 4c: scatterplot sex stratified DEx results, for paper

#this is a function for a scatterplot that plots the log10(FDR) for the stratified DEx results
#with this plot, we can visualize how the proteins are different up- or down- regulated for male and female

scatterplot_FDR_male_female = function(data,
                                       x_name = "females", 
                                       y_name = "males", 
                                       cut_off = -log10(0.05),
                                       main_title, 
                                       labels,
                                       max.overlaps = 10, 
                                       lab_x = "signed -log10(FDR) for females",
                                       lab_y = "signed -log10(FDR) for males",
                                       labels_T_F = T){
  data$omic_type = rep("ns", nrow(data))
  text_y = paste0("significant in ", y_name)
  text_x = paste0("significant in ", x_name)
  data$omic_type[abs(data$y) >= cut_off] = text_y
  data$omic_type[abs(data$x) >= cut_off] = text_x
  data$omic_type[(abs(data$x) >= cut_off) & (abs(data$y) >= cut_off)] = "significant in both"
  cols <- c("x" = "#E78AC3", "y" = "#66C2A5", "ns" = "grey", "significant in both" = "mediumpurple1") 
  attributes(cols)$names[1] = text_x
  attributes(cols)$names[2] = text_y

 plot = ggplot(data, aes(x,y)) +
  geom_point(aes(colour = omic_type),
               alpha = 0.5,
               shape = 16,
               size = 2) +
  geom_point(data = filter(data, abs(y) >= cut_off | abs(x) >= cut_off),
               aes(colour = omic_type), 
               alpha = 0.5, 
               shape = 16,
               size = 3) + 
  geom_hline(yintercept = cut_off, linetype = "dashed", colour = "grey40") +
  geom_hline(yintercept = -cut_off, linetype = "dashed", colour = "grey40") +
  geom_vline(xintercept = cut_off, linetype = "dashed", colour = "grey40") +
  geom_vline(xintercept = -cut_off, linetype = "dashed", colour = "grey40") +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey80") +
  geom_vline(xintercept = 0, linetype = "dashed", colour = "grey80") +
     geom_text_repel(data = filter(data, name %in% labels),
                     aes(label = name),
                     force = 1,
                    hjust = 1,
                     nudge_x = - 0.05,
                    nudge_y = 0.05,
                    #direction = "y",
                    max.overlaps = max.overlaps,
                    segment.size = 0.2,
                     size = 2)  +
    scale_colour_manual(values = cols) + 
    scale_fill_manual(values = cols) +
  labs(title = main_title,
         x = lab_x,
         y = lab_y,
         colour = "Differential \nExpression") +
    theme_few() + # Select theme with a white background  
    # theme(axis.title.y = element_text(size = 10),
    #       axis.title.x = element_text(size = 10),
    #       axis.text = element_text(size = 8),
    #       plot.title = element_text(size = 10, hjust = 0.5),
    #       text = element_text(size = 10)) +
    annotate("text", x = -0.5, y = 3, label = 
               paste0(sum(data$omic_type==text_y), " ", text_y,"\n", 
                      sum(data$omic_type==text_x), " ", text_x,"\n", 
                      sum(data$omic_type=="significant in both"), " significant in both"))

  return(plot)

}

#plot results DE - male-female

  res_f = res$imp_MinProb_age_cov_only_female #results for the females
  res_m = res$imp_MinProb_age_cov_only_male #results for the males

  title = "MinProb_age_cov"
  logfc_name = colnames(res_m)[grep("diff", colnames(res_m))] #take the name in the table that represents the fold change
  df_m = res_m[,c("name", logfc_name, "fdr")] #select three columns: the protein name colum, the log fold change column, and the FDR column
  df_f = res_f[,c("name", logfc_name, "fdr")]

  colnames(df_m) = colnames(df_f) = c("name", "logFC", "FDR") #adjust the column names
  df_merge <- merge(df_m, df_f, by = "name", all = TRUE) #merge the male and female dataframe
  df_merge$signed_minlogFDR_male = -log10(df_merge$FDR.x) #create a minlog10(FDR)
  df_merge$signed_minlogFDR_female = -log10(df_merge$FDR.y) #create a minlog10(FDR)
  df_merge$signed_minlogFDR_male[df_merge$logFC.x<0] = -df_merge$signed_minlogFDR_male[df_merge$logFC.x<0] #make the minlog10() "signed", which means that it will be made negative if the logFC is also negative
  df_merge$signed_minlogFDR_female[df_merge$logFC.y<0] = -df_merge$signed_minlogFDR_female[df_merge$logFC.y<0]
  
  df = df_merge[, c("signed_minlogFDR_male", "signed_minlogFDR_female", "name")] #only select the minlogFDR columns and the protein names
  names(df) <- c("y","x","name") #rename columns
  scatterplot_FDR_male_female(data = df, #use function to make the plot
                              max.overlaps = 30, 
                              labels = labels$imp_MinProb_age_cov_only_male,
                              main_title = paste0("scatterplot significant genes in males and females\n", title))

  ggsave("plots/paper/scatterplot_sex_stratified_age_cov.pdf", #save the plot
                     width = 11/2, height = 3, units = "in")

Visualization 5a: GSEA with clusterprofiler package

#the function for the GSEA is quite big. This is because within the function we:
# - set the background
# - run the GSEA twice: with FDR cut-off and without (the results without cut-off are the ones we save, the results with are for the figures)
# - select the top 10 results if there are no results passing the FDR cut-off
# - change the pathway names layout, since it is all in caps and mentions the ontology with every pathway name which can be removed
# - create labels for the barplot later on, that counts the number of genes from the pathway and the number that we detect in our analysis
# - plot a barplot 
# - plot a large cnetplot with labels
# - plot a cnetplot without labels

#libraries needed for the GSEA analysis
library(clusterProfiler)
library(enrichplot)
library(ggplot2)
library(msigdbr)
library(dichromat)
library(stringr)
redblue<-colorRampPalette(c("red","blue"))

#function for performing GSEA with clusterprofiler package and creating the corresponding plots
clusterprofiler_gsea = function(data, ont, title, alpha = 0.05, breaks = c(0.001,0.01,0.05)){
                  
  #perform gsea
          
          dg = sort(data, decreasing = TRUE)  #sort proteins on decreasing log-fold change (required for gsea)
        
      #create background according to the ontology used for the analysis    
          if(ont != "KEGG"){
            bg <- msigdbr(species = "Homo sapiens", category = "C5", subcategory = ont) %>% 
                dplyr::select(gs_name, gene_symbol)
              bg <- bg[bg$gene_symbol %in% names(dg), ]
          }else{
            bg <- msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:KEGG") %>% 
                dplyr::select(gs_name, gene_symbol)
              bg <- bg[bg$gene_symbol %in% names(dg), ]
          }
        
        #the gsea analysis without cut-off   
        gse = GSEA(geneList=dg, #performing the gsea
             nPermSimple = 100000, 
             minGSSize = 3, #minimum gene set size
             maxGSSize = 800, #maximum gene set size
             pvalueCutoff = 1, #we don't select for specific p-value yet
             verbose = FALSE, 
             TERM2GENE = bg, #background
             pAdjustMethod = "BH") #benjamini hochberg correction
        
        #the gsea analysis with cut-off   
        gse2 = GSEA(geneList=dg, #performing the gsea
             nPermSimple = 100000, 
             minGSSize = 3, #minimum gene set size
             maxGSSize = 800, #maximum gene set size
             pvalueCutoff = alpha, 
             verbose = FALSE, 
             TERM2GENE = bg, #background
             pAdjustMethod = "BH") #benjamini hochberg correction
        
  #process gsea results  
        
          #if we have no results with less than 0.1 FDR, than we take the best 10 results
          gse_result = gse@result #from the gsea file we only use the results section to work with
          if (min(gse_result$p.adjust)<=alpha) {        #take only pathways with a FDR of 0.1 or lower
              gse_result_top = gse_result[gse_result$p.adjust<=alpha,] 
              } else {
                if(nrow(gse_result)<10){
                  gse_result_top = gse_result
                } else {
                  gse_result_top = gse_result[1:10,]
                }
              }
          
          # prettify description text - to lower case and remove ontology term at the start of each pathway name
          gse_result_top$Description = chartr("_", " ", gse_result_top$Description)
          gse_result_top$Description = tolower(gse_result_top$Description)
          if(ont!="KEGG"){
              gse_result_top$Description = sub('^\\w+\\s', '', gse_result_top$Description)
          }
          #sort results on enrichment score
          gse_result_top$Description <- factor(gse_result_top$Description, 
                                               levels = gse_result_top$Description[order(gse_result_top$enrichmentScore, 
                                                                                         decreasing = FALSE)])
          #create labels for barplot
          gse_result_top$ngenes = rep(NA, nrow(gse_result_top)) #make empty vectors for values
          gse_result_top$geom_labels = rep(NA, nrow(gse_result_top))
          for(o in 1:nrow(gse_result_top)){
            #full number of genes by counting words separated by /
            gse_result_top$ngenes[o] = length(unlist(strsplit(gse_result_top$core_enrichment[o], 
                                                              split='/', 
                                                              fixed=TRUE))) 
            #paste gene number + set size and significance level into label vector
            gse_result_top$geom_labels[o] = paste0(gse_result_top$ngenes[o],
                                                   "/", 
                                                   gse_result_top$setSize[o])} 

  #plot figure individually

       barplot = ggplot(data=gse_result_top, 
            aes(x=Description, y=gse_result_top$enrichmentScore, fill = p.adjust)) +
            geom_bar(stat="identity") +
            coord_flip() +
            scale_fill_gradientn(colours= redblue(255), 
                                 breaks=breaks,
                                 limits=c(0,alpha)) +
            theme(panel.grid.major = element_blank(), 
                  panel.grid.minor = element_blank(), 
                  axis.title.y=element_blank(),
                  panel.background = element_blank(),
                  text = element_text(size = 13, family="sans"),
                  axis.line = element_line(colour = "black")) +
                  labs(
                  title=title,
                       y ="Enrichment Score") +
            geom_text(aes(label = gse_result_top$geom_labels), colour="white", 
                      position = position_stack(vjust = 0.5)) + 
            guides(fill=guide_colourbar(title="FDR")) +  # Modify labels of ggplot2 barplot
            scale_x_discrete(labels = function(x) str_wrap(x, width = 40)) +
            ylim(-1, 1)

      if(nrow(gse2@result)>1){ #only run this with significant results
        #cnetplots
            #large with labels
            cnetplot_labels = cnetplot(gse2, node_label = 'all', showCategory = 1500, color.params = list(foldChange = dg))  +
              ggtitle(title)
            
            #small without labels
            cnetplot_no_labels = cnetplot(gse2, node_label = 'none', showCategory = 1500, color.params = list(foldChange = dg))  +
              ggtitle(title)
  
      }else{
        cnetplot_labels = "no_significant_results"
        cnetplot_no_labels = "no_signficant_results"
        print(paste0(title, " has no significant results and therefore no cnetplot was made"))
      }
          return(
            list(
              top_results = gse_result_top, 
              all_results = gse@result, 
              barplot = barplot,
              cnetplot_labels = cnetplot_labels,
              cnetplot_no_labels = cnetplot_no_labels))
  }

##### PERFORMING THE GSEA WITH THE FUNCTION ABOVE

      #create special directory for these plots
      dir.create(file.path(getwd(),'plots/gsea_clusterprofiler'), showWarnings = FALSE)
      
      #the ontologies to test:
      ontologies = c("BP", "CC", "MF", "KEGG") #this is for input in the function
      ontologies_text = c("Biological Process", "Cellular Component", "Molecular Function", "KEGG") #this is to spell it out in the figures
      #lists for the results
      gsea_results = list() 
      barplots_gsea = list()
      gsea_results_tables = list()
      cnetplots_labels = list()
      cnetplots_nolabels = list()
      
      l = 1
      
      for(i in 1:length(res_MinProb)){ #loop to perform GSEA on all MinProb DEx results
        for(j in 1:length(ontologies)){ #loop to perform GSEA on all ontologies
          title = paste0("GSEA_clusterprofiler_",names(res_MinProb)[i], "_", ontologies[j]) #title to track where in the loop we are
          
          #construct a named vector that is a signed minlog10(FDR)
          p = res_MinProb[[i]]$fdr 
          p = -log10(p)
          logfc_name = colnames(res_MinProb[[i]])[grep("diff", colnames(res_MinProb[[i]]))]
          p[res_MinProb[[i]][,logfc_name]<0] = -p[res_MinProb[[i]][,logfc_name]<0]
          names(p) = res_MinProb[[i]]$name
          
          #use the named vector to rank the proteins for the GSEA function
          suppressWarnings({
            gsea_results[[l]] = clusterprofiler_gsea(data = p, ont = ontologies[j], title, alpha = 0.05, breaks = c(0.001,0.01,0.05))
          })
          
          # save all the different parts of the results in here
          barplots_gsea[[l]] = gsea_results[[l]]$barplot
          gsea_results_tables[[l]] = gsea_results[[l]]$all_results
          cnetplots_labels[[l]] = gsea_results[[l]]$cnetplot_labels
          cnetplots_nolabels[[l]] = gsea_results[[l]]$cnetplot_no_labels
          
          #give all the list elements a name
          names(gsea_results)[l] = title
          names(barplots_gsea)[l] = title
          names(gsea_results_tables)[l] = title
          names(cnetplots_labels)[l] = title
          names(cnetplots_nolabels)[l] = title
          
          print(l)
          print(title)
          l = l+1
        }
      }
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## [1] 1
## [1] "GSEA_clusterprofiler_no_cov_all_patients_BP"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_no_cov_all_patients_CC has no significant results and therefore no cnetplot was made"
## [1] 2
## [1] "GSEA_clusterprofiler_no_cov_all_patients_CC"
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## [1] 3
## [1] "GSEA_clusterprofiler_no_cov_all_patients_MF"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_no_cov_all_patients_KEGG has no significant results and therefore no cnetplot was made"
## [1] 4
## [1] "GSEA_clusterprofiler_no_cov_all_patients_KEGG"
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## [1] 5
## [1] "GSEA_clusterprofiler_no_cov_only_female_BP"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_no_cov_only_female_CC has no significant results and therefore no cnetplot was made"
## [1] 6
## [1] "GSEA_clusterprofiler_no_cov_only_female_CC"
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## [1] 7
## [1] "GSEA_clusterprofiler_no_cov_only_female_MF"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_no_cov_only_female_KEGG has no significant results and therefore no cnetplot was made"
## [1] 8
## [1] "GSEA_clusterprofiler_no_cov_only_female_KEGG"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_no_cov_only_male_BP has no significant results and therefore no cnetplot was made"
## [1] 9
## [1] "GSEA_clusterprofiler_no_cov_only_male_BP"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_no_cov_only_male_CC has no significant results and therefore no cnetplot was made"
## [1] 10
## [1] "GSEA_clusterprofiler_no_cov_only_male_CC"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_no_cov_only_male_MF has no significant results and therefore no cnetplot was made"
## [1] 11
## [1] "GSEA_clusterprofiler_no_cov_only_male_MF"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_no_cov_only_male_KEGG has no significant results and therefore no cnetplot was made"
## [1] 12
## [1] "GSEA_clusterprofiler_no_cov_only_male_KEGG"
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## [1] 13
## [1] "GSEA_clusterprofiler_age_cov_all_patients_BP"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_age_cov_all_patients_CC has no significant results and therefore no cnetplot was made"
## [1] 14
## [1] "GSEA_clusterprofiler_age_cov_all_patients_CC"
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## [1] 15
## [1] "GSEA_clusterprofiler_age_cov_all_patients_MF"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_age_cov_all_patients_KEGG has no significant results and therefore no cnetplot was made"
## [1] 16
## [1] "GSEA_clusterprofiler_age_cov_all_patients_KEGG"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_age_cov_only_female_BP has no significant results and therefore no cnetplot was made"
## [1] 17
## [1] "GSEA_clusterprofiler_age_cov_only_female_BP"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_age_cov_only_female_CC has no significant results and therefore no cnetplot was made"
## [1] 18
## [1] "GSEA_clusterprofiler_age_cov_only_female_CC"
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## [1] 19
## [1] "GSEA_clusterprofiler_age_cov_only_female_MF"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_age_cov_only_female_KEGG has no significant results and therefore no cnetplot was made"
## [1] 20
## [1] "GSEA_clusterprofiler_age_cov_only_female_KEGG"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_age_cov_only_male_BP has no significant results and therefore no cnetplot was made"
## [1] 21
## [1] "GSEA_clusterprofiler_age_cov_only_male_BP"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_age_cov_only_male_CC has no significant results and therefore no cnetplot was made"
## [1] 22
## [1] "GSEA_clusterprofiler_age_cov_only_male_CC"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_age_cov_only_male_MF has no significant results and therefore no cnetplot was made"
## [1] 23
## [1] "GSEA_clusterprofiler_age_cov_only_male_MF"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_age_cov_only_male_KEGG has no significant results and therefore no cnetplot was made"
## [1] 24
## [1] "GSEA_clusterprofiler_age_cov_only_male_KEGG"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_sex_cov_all_patients_BP has no significant results and therefore no cnetplot was made"
## [1] 25
## [1] "GSEA_clusterprofiler_sex_cov_all_patients_BP"
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## [1] 26
## [1] "GSEA_clusterprofiler_sex_cov_all_patients_CC"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_sex_cov_all_patients_MF has no significant results and therefore no cnetplot was made"
## [1] 27
## [1] "GSEA_clusterprofiler_sex_cov_all_patients_MF"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_sex_cov_all_patients_KEGG has no significant results and therefore no cnetplot was made"
## [1] 28
## [1] "GSEA_clusterprofiler_sex_cov_all_patients_KEGG"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_age_sex_cov_all_patients_BP has no significant results and therefore no cnetplot was made"
## [1] 29
## [1] "GSEA_clusterprofiler_age_sex_cov_all_patients_BP"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_age_sex_cov_all_patients_CC has no significant results and therefore no cnetplot was made"
## [1] 30
## [1] "GSEA_clusterprofiler_age_sex_cov_all_patients_CC"
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## [1] 31
## [1] "GSEA_clusterprofiler_age_sex_cov_all_patients_MF"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_age_sex_cov_all_patients_KEGG has no significant results and therefore no cnetplot was made"
## [1] 32
## [1] "GSEA_clusterprofiler_age_sex_cov_all_patients_KEGG"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_no_cov_all_patients_BP has no significant results and therefore no cnetplot was made"
## [1] 33
## [1] "GSEA_clusterprofiler_ALS_no_cov_all_patients_BP"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_no_cov_all_patients_CC has no significant results and therefore no cnetplot was made"
## [1] 34
## [1] "GSEA_clusterprofiler_ALS_no_cov_all_patients_CC"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_no_cov_all_patients_MF has no significant results and therefore no cnetplot was made"
## [1] 35
## [1] "GSEA_clusterprofiler_ALS_no_cov_all_patients_MF"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_no_cov_all_patients_KEGG has no significant results and therefore no cnetplot was made"
## [1] 36
## [1] "GSEA_clusterprofiler_ALS_no_cov_all_patients_KEGG"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_no_cov_only_female_BP has no significant results and therefore no cnetplot was made"
## [1] 37
## [1] "GSEA_clusterprofiler_ALS_no_cov_only_female_BP"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_no_cov_only_female_CC has no significant results and therefore no cnetplot was made"
## [1] 38
## [1] "GSEA_clusterprofiler_ALS_no_cov_only_female_CC"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_no_cov_only_female_MF has no significant results and therefore no cnetplot was made"
## [1] 39
## [1] "GSEA_clusterprofiler_ALS_no_cov_only_female_MF"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_no_cov_only_female_KEGG has no significant results and therefore no cnetplot was made"
## [1] 40
## [1] "GSEA_clusterprofiler_ALS_no_cov_only_female_KEGG"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_no_cov_only_male_BP has no significant results and therefore no cnetplot was made"
## [1] 41
## [1] "GSEA_clusterprofiler_ALS_no_cov_only_male_BP"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_no_cov_only_male_CC has no significant results and therefore no cnetplot was made"
## [1] 42
## [1] "GSEA_clusterprofiler_ALS_no_cov_only_male_CC"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_no_cov_only_male_MF has no significant results and therefore no cnetplot was made"
## [1] 43
## [1] "GSEA_clusterprofiler_ALS_no_cov_only_male_MF"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_no_cov_only_male_KEGG has no significant results and therefore no cnetplot was made"
## [1] 44
## [1] "GSEA_clusterprofiler_ALS_no_cov_only_male_KEGG"
## [1] "GSEA_clusterprofiler_ALS_age_cov_all_patients_BP has no significant results and therefore no cnetplot was made"
## [1] 45
## [1] "GSEA_clusterprofiler_ALS_age_cov_all_patients_BP"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_age_cov_all_patients_CC has no significant results and therefore no cnetplot was made"
## [1] 46
## [1] "GSEA_clusterprofiler_ALS_age_cov_all_patients_CC"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_age_cov_all_patients_MF has no significant results and therefore no cnetplot was made"
## [1] 47
## [1] "GSEA_clusterprofiler_ALS_age_cov_all_patients_MF"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_age_cov_all_patients_KEGG has no significant results and therefore no cnetplot was made"
## [1] 48
## [1] "GSEA_clusterprofiler_ALS_age_cov_all_patients_KEGG"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_age_cov_only_female_BP has no significant results and therefore no cnetplot was made"
## [1] 49
## [1] "GSEA_clusterprofiler_ALS_age_cov_only_female_BP"
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
## [1] 50
## [1] "GSEA_clusterprofiler_ALS_age_cov_only_female_CC"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_age_cov_only_female_MF has no significant results and therefore no cnetplot was made"
## [1] 51
## [1] "GSEA_clusterprofiler_ALS_age_cov_only_female_MF"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_age_cov_only_female_KEGG has no significant results and therefore no cnetplot was made"
## [1] 52
## [1] "GSEA_clusterprofiler_ALS_age_cov_only_female_KEGG"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_age_cov_only_male_BP has no significant results and therefore no cnetplot was made"
## [1] 53
## [1] "GSEA_clusterprofiler_ALS_age_cov_only_male_BP"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_age_cov_only_male_CC has no significant results and therefore no cnetplot was made"
## [1] 54
## [1] "GSEA_clusterprofiler_ALS_age_cov_only_male_CC"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_age_cov_only_male_MF has no significant results and therefore no cnetplot was made"
## [1] 55
## [1] "GSEA_clusterprofiler_ALS_age_cov_only_male_MF"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_age_cov_only_male_KEGG has no significant results and therefore no cnetplot was made"
## [1] 56
## [1] "GSEA_clusterprofiler_ALS_age_cov_only_male_KEGG"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_sex_cov_all_patients_BP has no significant results and therefore no cnetplot was made"
## [1] 57
## [1] "GSEA_clusterprofiler_ALS_sex_cov_all_patients_BP"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_sex_cov_all_patients_CC has no significant results and therefore no cnetplot was made"
## [1] 58
## [1] "GSEA_clusterprofiler_ALS_sex_cov_all_patients_CC"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_sex_cov_all_patients_MF has no significant results and therefore no cnetplot was made"
## [1] 59
## [1] "GSEA_clusterprofiler_ALS_sex_cov_all_patients_MF"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_sex_cov_all_patients_KEGG has no significant results and therefore no cnetplot was made"
## [1] 60
## [1] "GSEA_clusterprofiler_ALS_sex_cov_all_patients_KEGG"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_age_sex_cov_all_patients_BP has no significant results and therefore no cnetplot was made"
## [1] 61
## [1] "GSEA_clusterprofiler_ALS_age_sex_cov_all_patients_BP"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_age_sex_cov_all_patients_CC has no significant results and therefore no cnetplot was made"
## [1] 62
## [1] "GSEA_clusterprofiler_ALS_age_sex_cov_all_patients_CC"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_age_sex_cov_all_patients_MF has no significant results and therefore no cnetplot was made"
## [1] 63
## [1] "GSEA_clusterprofiler_ALS_age_sex_cov_all_patients_MF"
## no term enriched under specific pvalueCutoff...
## [1] "GSEA_clusterprofiler_ALS_age_sex_cov_all_patients_KEGG has no significant results and therefore no cnetplot was made"
## [1] 64
## [1] "GSEA_clusterprofiler_ALS_age_sex_cov_all_patients_KEGG"
      #save results
      saveRDS(gsea_results, file = "results/GSEA_clusterprofiler_all_results.rds")

      #save the results in an excel file
      names(gsea_results_tables) = gsub("GSEA_clusterprofiler_", "", names(gsea_results_tables)) #make the names shorter for the excel
      names(gsea_results_tables) = gsub("female", "f", names(gsea_results_tables))
      names(gsea_results_tables) = gsub("male", "m", names(gsea_results_tables))
      names(gsea_results_tables) = gsub("patients", "pats", names(gsea_results_tables))
      write_xlsx(gsea_results_tables, path = "results/GSEA_results_MinProb.xlsx") #save as excel in results folder
      

      #now we have large lists with all the barplots and cnetplots
      #you can choose which barplots or cnetplots you want to plot, and plot them in a grid
      #then you can also save them with the ggsave() function
      
      #EXAMPLE
      ggarrange(plotlist = barplots_gsea[1:4], ncol = 2, nrow = 2) #take only the first 4 list items from the barplots_gsea list

      ggsave("plots/barplots_GSEA_example.pdf", #save the plot
                     width = 10, height = 10, units = "in")

Visualization 5b: pathway barplot for paper

#the files I load here are manually annotated files. Each pathway is categorized. They originate from the original results but I randomized them
ann_GSEA_results = readRDS("data/shuffeld_GSEA_annotated_results.rds")

  for(i in 1:length(ann_GSEA_results)){
    order = as.character(unique(ann_GSEA_results[[i]]$category))
    ann_GSEA_results[[i]]$category = factor(ann_GSEA_results[[i]]$category, levels = order)
    
    ann_GSEA_results[[i]]$log10 = as.numeric(ann_GSEA_results[[i]]$log10)
    ann_GSEA_results[[i]]$enrichmentScore = as.numeric(ann_GSEA_results[[i]]$enrichmentScore)
    
    up = "up"
    down = "down"
    ann_GSEA_results[[i]]$comparison = rep(up, nrow(ann_GSEA_results[[i]]))
    ann_GSEA_results[[i]]$comparison[ann_GSEA_results[[i]]$enrichmentScore < 0] = down
    
  #BLACK FACETGRID    
      ggplot(ann_GSEA_results[[i]], aes(x = reorder(ID, abs(log10)), y = log10, fill = comparison)) +
        geom_bar(stat = "identity", position = "identity", color = "black", alpha = 0.7) +
        coord_flip() + 
        # Add labels and customize the plot
        labs(title = paste0("Bar Plot of signed log10(FDR)\n", names(ann_GSEA_results)[i]),
             x = "ID",
             y = "Signed log10(FDR)") +
        theme_few() +
        # Customize colors using the custom color palette
        scale_fill_manual(values = final_colours$volcano_plot, aesthetics = "fill") +
        # Adjust x-axis label font size
        #theme(axis.text.y = element_text(size = 7))
        theme(axis.text.y = element_blank()) +
        facet_grid(category ~ ., scales = "free", space = "free") +
        theme(strip.text.y = element_text(angle = 0)) + 
        theme(panel.spacing = unit(0.1, "lines"))
            
      ggsave(paste0("plots/paper/barplot_Lauras_annotation_", names(ann_GSEA_results)[i],"_facetgrid_black.pdf"),
             width = 5, 
             height = 1 + nrow(ann_GSEA_results[[i]])*0.06,
             units = "in")
  }

Visualization 6: subsampling analysis

library(IceR)

#parameters for the analysis below
    n_bs = 500 #number of bootstraps
    n_resamples = 66 #number of resamples
    set.seed(9) 
    covariates = "age_cov" #covariates
    covariates_f = ~0 + condition + age_cat #formula for covariates
    empty_m = as.data.frame(matrix(nrow = dim(data$imp_MinProb)[1], ncol = n_bs)) #generate empty matrix
    rownames(empty_m) = data[["imp_MinProb"]]@NAMES #set rownames empty matrix
    res_matrices = list(fdr_males = empty_m, fdr_females = empty_m, fc_males = empty_m, fc_females = empty_m) #put 4 empty matrices in a list

# Function to resample matrix columns with replacement
    resample_matrix <- function(matrix, num_samples) {
      num_columns <- ncol(matrix)
      resampled_indices <- sample(num_columns, size = num_samples, replace = TRUE)
      resampled_matrix <- matrix[, resampled_indices]
      return(resampled_matrix)
    }
  
#a bootstrapping loop to rerun the differential expression analysis the number of times we set before in the parameters
    for(i in 1:n_bs){ 
        set.seed(i) #different seed every time we run the function to get different randomization
        d = assay(data[["imp_MinProb"]]) #load the data
        sex = data[["imp_MinProb"]]$sex #load sex labels
        d_male = d[,sex == "Male"] #separate dataframe with males
        d_female = d[,sex == "Female"] #separate dataframe with females
        
        d_male_resample = resample_matrix(matrix = d_male, num_samples = n_resamples) # generate resampled matrix for males
        d_female_resample = resample_matrix(matrix = d_female, num_samples = n_resamples) # generate resampled matrix for females
        
        d_male_resample = as.data.frame(d_male_resample) #change from matrix format to dataframe
        d_female_resample = as.data.frame(d_female_resample)
        
        #test males
          t = LIMMA_analysis( #limma_analysis is a function that uses the differential expression analysis from the limma package
            data = d_male_resample,
            #the disease status is stored in the colnames so we use the code below to extract that
            assignments = unlist(str_split(colnames(d_male_resample), pattern = "_"))[seq(from = 1, to = ncol(d_male_resample)*2, by = 2)], 
            contrast = "als_vs_ctrl"
          )
          # Order dataframe based on row names
          t <- t[order(rownames(t)), ]
          t$fdr = p.adjust(t$P.Value, method="BH")
          res_matrices$fdr_males[,i] = t$fdr
          res_matrices$fc_males[,i] = t$logFC
          
        #test females
          t = LIMMA_analysis(
            data = d_female_resample,
            assignments = unlist(str_split(colnames(d_female_resample), pattern = "_"))[seq(from = 1, to = ncol(d_female_resample)*2, by = 2)],
            contrast = "als_vs_ctrl"
          )
          # Order dataframe based on row names
          t <- t[order(rownames(t)), ]
          t$fdr = p.adjust(t$P.Value, method="BH")
          res_matrices$fdr_females[,i] = t$fdr
          res_matrices$fc_females[,i] = t$logFC
        
    }

#make a dataframe with the means of the results
    mean_res_bs = data.frame(
      FDR.y = rowMeans(res_matrices$fdr_males),
      FDR.x = rowMeans(res_matrices$fdr_females),
      logFC.y = rowMeans(res_matrices$fc_males),
      logFC.x = rowMeans(res_matrices$fc_females))
    

#VISUALIZATION 1: scatterplot with plotting the means of each protein for male and female

    #this function codes for a scatterplot with the results
    scatterplot_FDR_male_female = function(data,
                                           x_name = "females", 
                                           y_name = "males", 
                                           cut_off = -log10(0.1),
                                           main_title, 
                                           labels,
                                           max.overlaps = 10, 
                                           lab_x = "signed -log10(FDR) for females",
                                           lab_y = "signed -log10(FDR) for males",
                                           labels_T_F = T){
      data$omic_type = rep("ns", nrow(data))
      text_y = paste0("significant in ", y_name)
      text_x = paste0("significant in ", x_name)
      data$omic_type[abs(data$y) >= cut_off] = text_y
      data$omic_type[abs(data$x) >= cut_off] = text_x
      data$omic_type[(abs(data$x) >= cut_off) & (abs(data$y) >= cut_off)] = "significant in both"
      cols <- c("x" = "#E78AC3", "y" = "#66C2A5", "ns" = "grey", "significant in both" = "mediumpurple1") 
      attributes(cols)$names[1] = text_x
      attributes(cols)$names[2] = text_y
    
     plot = ggplot(data, aes(x,y)) +
      geom_point(aes(colour = omic_type),
                   alpha = 0.5,
                   shape = 16,
                   size = 2) +
      geom_point(data = filter(data, abs(y) >= cut_off | abs(x) >= cut_off),
                   aes(colour = omic_type), 
                   alpha = 0.5, 
                   shape = 16,
                   size = 3) + 
      geom_hline(yintercept = cut_off, linetype = "dashed", colour = "grey40") +
      geom_hline(yintercept = -cut_off, linetype = "dashed", colour = "grey40") +
      geom_vline(xintercept = cut_off, linetype = "dashed", colour = "grey40") +
      geom_vline(xintercept = -cut_off, linetype = "dashed", colour = "grey40") +
      geom_hline(yintercept = 0, linetype = "dashed", colour = "grey80") +
      geom_vline(xintercept = 0, linetype = "dashed", colour = "grey80") +
         geom_text_repel(data = filter(data, name %in% labels),
                         aes(label = name),
                         force = 1,
                        hjust = 1,
                         nudge_x = - 0.05,
                        nudge_y = 0.05,
                        #direction = "y",
                        max.overlaps = max.overlaps,
                        segment.size = 0.2,
                         size = 2)  +
        scale_colour_manual(values = cols) + 
        scale_fill_manual(values = cols) +
      labs(title = main_title,
             x = lab_x,
             y = lab_y,
             colour = "Differential \nExpression") +
        theme_few() + # Select theme with a white background  
        # theme(axis.title.y = element_text(size = 10),
        #       axis.title.x = element_text(size = 10),
        #       axis.text = element_text(size = 8),
        #       plot.title = element_text(size = 10, hjust = 0.5),
        #       text = element_text(size = 10)) +
        annotate("text", x = -0.5, y = 0.5, label = 
                   paste0(sum(data$omic_type==text_y), " ", text_y,"\n", 
                          sum(data$omic_type==text_x), " ", text_x,"\n", 
                          sum(data$omic_type=="significant in both"), " significant in both"))
    
      return(plot)
    
    }
    
    #plot results DE with the scatterplot function above - male-female
    
      title = paste0("MinProb_age_cov_", n_bs, "_times_bootstrapping_n=", n_resamples)
    
      mean_res_bs$signed_minlogFDR_male = -log10(mean_res_bs$FDR.x)
      mean_res_bs$signed_minlogFDR_female = -log10(mean_res_bs$FDR.y)
      mean_res_bs$signed_minlogFDR_male[mean_res_bs$logFC.x<0] = -mean_res_bs$signed_minlogFDR_male[mean_res_bs$logFC.x<0]
      mean_res_bs$signed_minlogFDR_female[mean_res_bs$logFC.y<0] = -mean_res_bs$signed_minlogFDR_female[mean_res_bs$logFC.y<0]
      mean_res_bs$name = rownames(mean_res_bs)
      
      df = mean_res_bs[, c("signed_minlogFDR_male", "signed_minlogFDR_female", "name")]
      names(df) <- c("y","x","name")
      scatterplot_FDR_male_female(data = df, 
                                  max.overlaps = 30, 
                                  labels = labels,
                                  main_title = paste0("scatterplot significant genes in males and females\n", title))

      ggsave("plots/paper/scatterplot_bootstrapping_sex_stratified_age_cov.pdf", 
                         width = 10, height = 6, units = "in")
      
      
#VISUALIZATION 2: violin plot with number of significant hits
      
      n_sig = data.frame(
        n_sig_male = apply(X = res_matrices$fdr_males, MARGIN = 2, FUN = function(x) sum(x<=0.05)),
        n_sig_female = apply(X = res_matrices$fdr_females, MARGIN = 2, FUN = function(x) sum(x<=0.05))
      )
      #testing the different mean number of significant hits with a t-test
      t = t.test(n_sig$n_sig_female, n_sig$n_sig_male) 
      
      
      # Reshape the dataframe from wide to long format
      library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## The following object is masked from 'package:reshape2':
## 
##     smiths
      n_sig_long <- gather(n_sig, key = "variable", value = "value")
      n_sig_long$variable = as.factor(n_sig_long$variable)
      levels(n_sig_long$variable) = c("Female", "Male")
    
    # Create violin using ggplot
    ggviolin(n_sig_long, x = "variable", y = "value", fill = "variable", add = "boxplot") +
                          labs(title = paste0("Violin plot significant genes in males and females\n", title),
                               x = "Male or Female",
                               y = "Number of significant proteins in test") +
                          scale_fill_manual(values=final_colours$male_female) +
                          theme_few() +
                          annotate("text", x = 1.5, y = 400, label = paste0("p-value = ", round(t$p.value, 5)))

    ggsave("plots/paper/violin_bootstrapping_sex_stratified_age_cov.pdf", 
                         width = 4, height = 3, units = "in")

R and packages versions

sessionInfo()
## R version 4.2.3 (2023-03-15)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.3.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] tidyr_1.3.0                 IceR_0.9.16                
##  [3] ggpubr_0.6.0                writexl_1.4.2              
##  [5] viridis_0.6.4               viridisLite_0.4.2          
##  [7] readxl_1.4.3                readr_2.1.4                
##  [9] RColorBrewer_1.1-3          org.Hs.eg.db_3.16.0        
## [11] AnnotationDbi_1.60.2        data.table_1.14.8          
## [13] SummarizedExperiment_1.28.0 Biobase_2.58.0             
## [15] GenomicRanges_1.50.2        GenomeInfoDb_1.34.9        
## [17] IRanges_2.32.0              S4Vectors_0.36.2           
## [19] BiocGenerics_0.44.0         MatrixGenerics_1.10.0      
## [21] naniar_1.0.0                DEP_1.20.0                 
## [23] cowplot_1.1.1               umap_0.2.10.0              
## [25] reshape2_1.4.4              ggrepel_0.9.3              
## [27] dplyr_1.1.2                 stringr_1.5.0              
## [29] dichromat_2.0-0.1           msigdbr_7.5.1              
## [31] enrichplot_1.18.4           clusterProfiler_4.6.2      
## [33] wesanderson_0.3.6           matrixStats_1.0.0          
## [35] ggplot2_3.4.3               pheatmap_1.0.12            
## [37] ggthemes_4.2.4             
## 
## loaded via a namespace (and not attached):
##   [1] ragg_1.2.5             visdat_0.6.0           bit64_4.0.5           
##   [4] knitr_1.43             DelayedArray_0.24.0    KEGGREST_1.38.0       
##   [7] RCurl_1.98-1.12        doParallel_1.0.17      generics_0.1.3        
##  [10] preprocessCore_1.60.2  RSQLite_2.3.1          shadowtext_0.1.2      
##  [13] bit_4.0.5              tzdb_0.4.0             httpuv_1.6.11         
##  [16] assertthat_0.2.1       xfun_0.40              hms_1.1.3             
##  [19] jquerylib_0.1.4        babelgene_22.9         evaluate_0.21         
##  [22] promises_1.2.1         fansi_1.0.4            igraph_1.5.1          
##  [25] DBI_1.1.3              htmlwidgets_1.6.2      purrr_1.0.2           
##  [28] ellipsis_0.3.2         RSpectra_0.16-1        ggnewscale_0.4.9      
##  [31] backports_1.4.1        vctrs_0.6.3            imputeLCMD_2.1        
##  [34] abind_1.4-5            cachem_1.0.8           withr_2.5.0           
##  [37] ggforce_0.4.1          HDO.db_0.99.1          treeio_1.22.0         
##  [40] fdrtool_1.2.17         cluster_2.1.4          DOSE_3.24.2           
##  [43] ape_5.7-1              lazyeval_0.2.2         crayon_1.5.2          
##  [46] pkgconfig_2.0.3        labeling_0.4.3         tweenr_2.0.2          
##  [49] nlme_3.1-163           ProtGenerics_1.30.0    rlang_1.1.1           
##  [52] lifecycle_1.0.3        sandwich_3.0-2         downloader_0.4        
##  [55] affyio_1.68.0          randomForest_4.7-1.1   cellranger_1.1.0      
##  [58] polyclip_1.10-4        Matrix_1.6-1           aplot_0.2.0           
##  [61] carData_3.0-5          zoo_1.8-12             GlobalOptions_0.1.2   
##  [64] png_0.1-8              rjson_0.2.21           mzR_2.32.0            
##  [67] bitops_1.0-7           shinydashboard_0.7.2   gson_0.1.0            
##  [70] Biostrings_2.66.0      blob_1.2.4             shape_1.4.6           
##  [73] qvalue_2.30.0          rstatix_0.7.2          gridGraphics_0.5-1    
##  [76] tmvtnorm_1.5           ggsignif_0.6.4         scales_1.2.1          
##  [79] memoise_2.0.1          magrittr_2.0.3         plyr_1.8.8            
##  [82] hexbin_1.28.3          zlibbioc_1.44.0        compiler_4.2.3        
##  [85] scatterpie_0.2.1       pcaMethods_1.90.0      clue_0.3-64           
##  [88] cli_3.6.1              affy_1.76.0            XVector_0.38.0        
##  [91] patchwork_1.1.3        MASS_7.3-60            tidyselect_1.2.0      
##  [94] vsn_3.66.0             stringi_1.7.12         textshaping_0.3.6     
##  [97] highr_0.10             yaml_2.3.7             GOSemSim_2.24.0       
## [100] askpass_1.1            norm_1.0-11.1          MALDIquant_1.22.1     
## [103] grid_4.2.3             sass_0.4.7             fastmatch_1.1-4       
## [106] tools_4.2.3            parallel_4.2.3         circlize_0.4.15       
## [109] rstudioapi_0.15.0      MsCoreUtils_1.10.0     foreach_1.5.2         
## [112] gridExtra_2.3          farver_2.1.1           mzID_1.36.0           
## [115] ggraph_2.1.0           digest_0.6.33          BiocManager_1.30.22   
## [118] shiny_1.7.5            Rcpp_1.0.11            car_3.1-2             
## [121] broom_1.0.5            later_1.3.1            ncdf4_1.21            
## [124] httr_1.4.7             MSnbase_2.24.2         ComplexHeatmap_2.14.0 
## [127] colorspace_2.1-0       XML_3.99-0.14          reticulate_1.31       
## [130] splines_4.2.3          yulab.utils_0.0.8      tidytree_0.4.5        
## [133] graphlayouts_1.0.0     gmm_1.8                ggplotify_0.1.2       
## [136] systemfonts_1.0.4      xtable_1.8-4           jsonlite_1.8.7        
## [139] ggtree_3.6.2           tidygraph_1.2.3        ggfun_0.1.2           
## [142] R6_2.5.1               pillar_1.9.0           htmltools_0.5.6       
## [145] mime_0.12              glue_1.6.2             fastmap_1.1.1         
## [148] DT_0.29                BiocParallel_1.32.6    codetools_0.2-19      
## [151] fgsea_1.24.0           mvtnorm_1.2-3          utf8_1.2.3            
## [154] lattice_0.21-8         bslib_0.5.1            tibble_3.2.1          
## [157] GO.db_3.16.0           openssl_2.1.0          limma_3.54.2          
## [160] rmarkdown_2.24         munsell_0.5.0          GetoptLong_1.0.5      
## [163] GenomeInfoDbData_1.2.9 iterators_1.0.14       impute_1.72.3         
## [166] gtable_0.3.4